Updated keyword linking script

Updated keyword linking script to also update the URI of existing links if needed.
OPM · Jan 17, 2025 · 6a5995f · 6a5995f
1 parent 419ff64
commit 6a5995f
Showing 1 changed file with 116 additions and 39 deletions.
diff --git a/scripts/python/src/fodt/keyword_linker.py b/scripts/python/src/fodt/keyword_linker.py
@@ -92,9 +92,13 @@ def __init__(
         self.in_draw_frame = False  # We should not insert links in Figure captions
         self.in_draw_recursion = 0  # We can have nested draw:frame tags
         self.content = io.StringIO()
-        # Create a regex pattern with alternation on the keyword names
+        # This pattern will be used as part of the regex pattern
+        self.keyword_pattern = self.compile_keyword_pattern()
+        # Regex pattern with alternation on the keyword names
         self.regex = self.compile_regex()
+        self.current_uri = None  # The URI for the current text:a tag
         self.num_links_inserted = 0
+        self.num_uris_updated = 0  # Number of URIs for existing links that have been updated
         self.office_body_found = False
         # Set of paragraph styles using fixed width fonts, intialized with the
         #  "_40_Example" style that is used indirectly by the other example styles
@@ -111,19 +115,22 @@ def check_mono_paragraph(self, attrs: xml.sax.xmlreader.AttributesImpl) -> None:
             if style_name in self.mono_paragraph_styles:
                 self.in_mono_paragraph = True
 
+    def compile_keyword_pattern(self) -> re.Pattern:
+        pattern = '|'.join(
+            # Need to sort the keys by length in descending order to avoid
+            #  matching a substring of a longer keyword. See
+            # https://github.com/OPM/opm-reference-manual/pull/411#discussion_r1835446631
+            sorted((re.escape(k) for k in self.kw_uri_map.keys()), key=len, reverse=True)
+        )
+        return re.compile(pattern)
+
     def compile_regex(self) -> re.Pattern:
         # Also include the keyword name itself in the regex pattern, see discussion
         # https://github.com/OPM/opm-reference-manual/pull/410
         pattern = re.compile(
             r'(?<![.‘"“])'  # Negative lookbehind for a dot or a single/double quote
             r'(?<!&quot;)'    # Negative lookbehind: no HTML double-quote entity before keyword
-            r'\b(' +
-            '|'.join(
-                # Need to sort the keys by length in descending order to avoid
-                #  matching a substring of a longer keyword. See
-                # https://github.com/OPM/opm-reference-manual/pull/411#discussion_r1835446631
-                sorted((re.escape(k) for k in self.kw_uri_map.keys()), key=len, reverse=True)
-                ) +
+            r'\b(' + self.keyword_pattern.pattern +
             # NOTE: We cannot use \b here because if the keyword ends with "-" the word boundary
             #  \b will not match between a space and a hyphen. Instead we use a negative lookahead
             # Negative lookaheads: no word char, "-" or &apos; after the keyword
@@ -132,6 +139,7 @@ def compile_regex(self) -> re.Pattern:
         return pattern
 
     def characters(self, content: str):
+        self.check_keyword_link(content)
         # NOTE: characters() is only called if there is content between the start
         # tag and the end tag. If there is no content, characters() is not called.
         if self.start_tag_open:
@@ -148,6 +156,21 @@ def characters(self, content: str):
         if self.in_p and content.startswith("Table "):
             self.table_caption_info.seen_table_txt = True
 
+    def check_keyword_link(self, content: str) -> None:
+        if self.current_uri is not None:
+            current_uri = self.current_uri
+            # Check if the content is a keyword, if so we should insert a link
+            if self.keyword_pattern.fullmatch(content):
+                keyword = content
+                updated_uri = self.kw_uri_map[keyword]
+                if current_uri != updated_uri:
+                    current_uri = updated_uri
+                    self.num_uris_updated += 1
+            # Write the start tag here delayed now as we have checked the content
+            self.content.write(f'<text:a xlink:href="#{current_uri}">')
+            self.start_tag_open = False
+            self.current_uri = None
+
     def collect_example_style(self, attrs: xml.sax.xmlreader.AttributesImpl) -> None:
         # Collect the paragraph styles that use fixed width fonts
         if "style:name" in attrs.getNames():
@@ -198,6 +221,9 @@ def get_content(self) -> str:
     def get_num_links_inserted(self) -> int:
         return self.num_links_inserted
 
+    def get_num_uris_updated(self) -> int:
+        return self.num_uris_updated
+
     def is_table_caption(self, content: str) -> bool:
         # Check if the content is a specific table caption, in that case we should not insert links
         keyword_name = self.file_info
@@ -242,6 +268,16 @@ def maybe_collect_mono_paragraph_style(
                     #if len(fontsize) > 0:  # Check if the font size is set
                     #    self.mono_paragraph_style.libre_mono_font_size = True
 
+    def maybe_save_keyword_uri(self, attrs: xml.sax.xmlreader.AttributesImpl) -> bool:
+        if len(attrs.getNames()) == 1:  # Assume a single attribute "xlink:href"
+            attr = "xlink:href"
+            if attr in attrs.getNames():
+                href = attrs.getValue(attr)
+                if re.fullmatch(r'#[A-Za-z0-9_ ]+', href):
+                    self.current_uri = href[1:]
+                    return True
+        return False
+
     def maybe_write_characters(self) -> None:
         if len(self.char_buf) > 0:
             # NOTE: We need to escape the content before we apply the regex pattern
@@ -307,6 +343,12 @@ def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl):
             elif name == "text:a":
                 # We are inside an anchor, and we should not insert another text:a tag here
                 self.in_a = True
+                # This might be a link to an existing keyword. We might need to update its URI
+                if self.maybe_save_keyword_uri(attrs):
+                    # Delay writing the start tag until we have checked the content of the tag
+                    # in the characters() callback, since we might need to replace the URI attribute
+                    # with a new URI.
+                    return
             elif name == "text:span":
                 if "text:style-name" in attrs.getNames():
                     style_name = attrs.getValue("text:style-name")
@@ -351,70 +393,87 @@ def __init__(
         self.kw_uri_map = kw_uri_map
         self.check_changed = check_changed
 
-    def insert_links(self) -> int:
+    def insert_links(self) -> tuple[int, int]:
         num_links_inserted = 0
+        num_uris_updated = 0
         if len(self.chapters) > 0:
-            num_links_inserted += self.insert_links_in_chapters()
+            count1, count2 = self.insert_links_in_chapters()
+            num_links_inserted += count1
+            num_uris_updated += count2
         if len(self.subsections) > 0:
-            num_links_inserted += self.insert_links_in_subsections()
+            count1, count2 = self.insert_links_in_subsections()
+            num_links_inserted += count1
+            num_uris_updated += count2
         if len(self.appendices) > 0:
-            num_links_inserted += self.insert_links_in_appendices()
-        return num_links_inserted
+            count1, count2 = self.insert_links_in_appendices()
+            num_links_inserted += count1
+            num_uris_updated += count2
+        return num_links_inserted, num_uris_updated
 
-    def insert_links_in_chapters(self) -> int:
+    def insert_links_in_chapters(self) -> tuple[int, int]:
         start_dir = self.maindir / Directories.chapters
         num_links_inserted = 0
+        num_uris_updated = 0
         for chapter in self.chapters:
             logging.info(f"Processing chapter: {chapter}")
             filename = f"{chapter}.{FileExtensions.fodt}"
             path = start_dir / filename
-            count = self.insert_links_in_file(path, filename, FileType.CHAPTER)
-            num_links_inserted += count
-        return num_links_inserted
+            count1, count2 = self.insert_links_in_file(path, filename, FileType.CHAPTER)
+            num_links_inserted += count1
+            num_uris_updated += count2
+        return num_links_inserted, num_uris_updated
 
-    def insert_links_in_appendices(self) -> int:
+    def insert_links_in_appendices(self) -> tuple[int, int]:
         start_dir = self.maindir / Directories.appendices
         num_links_inserted = 0
+        num_uris_updated = 0
         for appendix in self.appendices:
             logging.info(f"Processing appendix: {appendix}")
             filename = f"{appendix}.{FileExtensions.fodt}"
             path = start_dir / filename
-            count = self.insert_links_in_file(path, filename, FileType.APPENDIX)
-            num_links_inserted += count
-        return num_links_inserted
+            count1, count2 = self.insert_links_in_file(path, filename, FileType.APPENDIX)
+            num_links_inserted += count1
+            num_uris_updated += count2
+        return num_links_inserted, num_uris_updated
 
-    def insert_links_in_subsections(self) -> int:
+    def insert_links_in_subsections(self) -> tuple[int, int]:
         start_dir = self.maindir / Directories.chapters / Directories.subsections
         num_links_inserted = 0
+        num_uris_updated = 0
         if self.filename:
             assert len(self.subsections) == 1
             path = start_dir / self.subsections[0] / self.filename
             keyword_name = self.filename.removesuffix(f".{FileExtensions.fodt}")
-            num_links_inserted = self.insert_links_in_file(path, keyword_name, FileType.SUBSECTION)
+            count1, count2 = self.insert_links_in_file(path, keyword_name, FileType.SUBSECTION)
+            num_links_inserted = count1
+            num_uris_updated = count2
         else:
             for subsection in self.subsections:
-               count =  self.insert_links_in_subsection(start_dir, subsection)
-               num_links_inserted += count
-        return num_links_inserted
+               count1, count2 =  self.insert_links_in_subsection(start_dir, subsection)
+               num_links_inserted += count1
+               num_uris_updated += count2
+        return num_links_inserted, num_uris_updated
 
-    def insert_links_in_subsection(self, start_dir: Path, subsection: str) -> int:
+    def insert_links_in_subsection(self, start_dir: Path, subsection: str) -> tuple[int, int]:
         files_processed = 0
         num_links_inserted = 0
+        num_uris_updated = 0
         item = start_dir / subsection
         logging.info(f"Processing subsection: {item.name}")
         for item2 in item.iterdir():
             if item2.suffix == f".{FileExtensions.fodt}":
                 keyword_name = item2.name.removesuffix(f".{FileExtensions.fodt}")
                 files_processed += 1
-                count = self.insert_links_in_file(
+                count1, count2 = self.insert_links_in_file(
                     item2, keyword_name, FileType.SUBSECTION, verbose=False, indent=True
                 )
-                num_links_inserted += count
+                num_links_inserted += count1
+                num_uris_updated += count2
         if files_processed == 0:
             logging.info("  No files processed.")
         else:
             logging.info(f"  Processed {files_processed} files.")
-        return num_links_inserted
+        return num_links_inserted, num_uris_updated
 
     def insert_links_in_file(
         self,
@@ -423,7 +482,7 @@ def insert_links_in_file(
         file_type: FileType,
         verbose: bool = True,
         indent: bool = False
-    ) -> int:
+    ) -> tuple[int, int]:
         parser = xml.sax.make_parser()
         handler = FileHandler(file_info, file_type, self.kw_uri_map)
         parser.setContentHandler(handler)
@@ -432,18 +491,25 @@ def insert_links_in_file(
         except HandlerDoneException as e:
             pass
         num_links_inserted = handler.get_num_links_inserted()
+        num_uris_updated = handler.get_num_uris_updated()
         indent_str = "  " if indent else ""
-        if num_links_inserted > 0:
+        if (num_links_inserted > 0) or (num_uris_updated > 0):
             if self.check_changed:
-                logging.info(f"{indent_str}{filename.name}: Links would be inserted.")
+                if num_links_inserted > 0:
+                    logging.info(f"{indent_str}{filename.name}: Links would be inserted.")
+                if num_uris_updated > 0:
+                    logging.info(f"{indent_str}{filename.name}: URIs would be updated.")
             else:
                 with open(filename, "w", encoding='utf8') as f:
                     f.write(handler.content.getvalue())
-                logging.info(f"{indent_str}{filename.name}: Inserted {num_links_inserted} links.")
+                if num_links_inserted > 0:
+                    logging.info(f"{indent_str}{filename.name}: Inserted {num_links_inserted} links.")
+                if num_uris_updated > 0:
+                    logging.info(f"{indent_str}{filename.name}: Updated {num_uris_updated} URIs.")
         else:
             if verbose and not self.check_changed:
-                logging.info(f"{indent_str}{filename.name}: No links inserted.")
-        return num_links_inserted
+                logging.info(f"{indent_str}{filename.name}: No links inserted or URIs updated.")
+        return (num_links_inserted, num_uris_updated)
 
 VALID_SUBSECTIONS = "4.3,5.3,6.3,7.3,8.3,9.3,10.3,11.3,12.3"
 VALID_CHAPTERS = "1,2,3,4,5,6,7,8,9,10,11,12"
@@ -577,7 +643,7 @@ def link_keywords(
         kw_uri_map = keyword_uri_map_generator.get_kw_uri_map(maindir, keyword_dir)
     else:
         kw_uri_map = helpers.load_kw_uri_map(maindir)
-    num_links_inserted = InsertLinks(
+    num_links_inserted, num_uris_updated = InsertLinks(
         maindir,
         subsections,
         chapters,
@@ -587,8 +653,19 @@ def link_keywords(
         check_changed
     ).insert_links()
     if check_changed:
-        if num_links_inserted > 0:
-            logging.error(f"Files have changed. {num_links_inserted} links would be inserted.")
+        if (num_links_inserted > 0) or (num_uris_updated > 0):
+            extra1 = None
+            msg = "Files have changed. "
+            if num_links_inserted > 0:
+                extra1 = f"{num_links_inserted} links would be inserted"
+            if num_uris_updated > 0:
+                if extra1 is not None:
+                    msg += f"{extra1}, and "
+                msg += f"{num_uris_updated} URIs would be updated."
+            else:
+                if extra1 is not None:
+                    msg += f"{extra1}."
+            logging.error(msg)
             exit(1)
         else:
             logging.info("Files have not changed.")