diff --git a/scripts/python/src/fodt/keyword_linker.py b/scripts/python/src/fodt/keyword_linker.py index 5ccbb73d..b083a7ba 100644 --- a/scripts/python/src/fodt/keyword_linker.py +++ b/scripts/python/src/fodt/keyword_linker.py @@ -92,9 +92,13 @@ def __init__( self.in_draw_frame = False # We should not insert links in Figure captions self.in_draw_recursion = 0 # We can have nested draw:frame tags self.content = io.StringIO() - # Create a regex pattern with alternation on the keyword names + # This pattern will be used as part of the regex pattern + self.keyword_pattern = self.compile_keyword_pattern() + # Regex pattern with alternation on the keyword names self.regex = self.compile_regex() + self.current_uri = None # The URI for the current text:a tag self.num_links_inserted = 0 + self.num_uris_updated = 0 # Number of URIs for existing links that have been updated self.office_body_found = False # Set of paragraph styles using fixed width fonts, intialized with the # "_40_Example" style that is used indirectly by the other example styles @@ -111,19 +115,22 @@ def check_mono_paragraph(self, attrs: xml.sax.xmlreader.AttributesImpl) -> None: if style_name in self.mono_paragraph_styles: self.in_mono_paragraph = True + def compile_keyword_pattern(self) -> re.Pattern: + pattern = '|'.join( + # Need to sort the keys by length in descending order to avoid + # matching a substring of a longer keyword. See + # https://github.com/OPM/opm-reference-manual/pull/411#discussion_r1835446631 + sorted((re.escape(k) for k in self.kw_uri_map.keys()), key=len, reverse=True) + ) + return re.compile(pattern) + def compile_regex(self) -> re.Pattern: # Also include the keyword name itself in the regex pattern, see discussion # https://github.com/OPM/opm-reference-manual/pull/410 pattern = re.compile( r'(? re.Pattern: return pattern def characters(self, content: str): + self.check_keyword_link(content) # NOTE: characters() is only called if there is content between the start # tag and the end tag. If there is no content, characters() is not called. if self.start_tag_open: @@ -148,6 +156,21 @@ def characters(self, content: str): if self.in_p and content.startswith("Table "): self.table_caption_info.seen_table_txt = True + def check_keyword_link(self, content: str) -> None: + if self.current_uri is not None: + current_uri = self.current_uri + # Check if the content is a keyword, if so we should insert a link + if self.keyword_pattern.fullmatch(content): + keyword = content + updated_uri = self.kw_uri_map[keyword] + if current_uri != updated_uri: + current_uri = updated_uri + self.num_uris_updated += 1 + # Write the start tag here delayed now as we have checked the content + self.content.write(f'') + self.start_tag_open = False + self.current_uri = None + def collect_example_style(self, attrs: xml.sax.xmlreader.AttributesImpl) -> None: # Collect the paragraph styles that use fixed width fonts if "style:name" in attrs.getNames(): @@ -198,6 +221,9 @@ def get_content(self) -> str: def get_num_links_inserted(self) -> int: return self.num_links_inserted + def get_num_uris_updated(self) -> int: + return self.num_uris_updated + def is_table_caption(self, content: str) -> bool: # Check if the content is a specific table caption, in that case we should not insert links keyword_name = self.file_info @@ -242,6 +268,16 @@ def maybe_collect_mono_paragraph_style( #if len(fontsize) > 0: # Check if the font size is set # self.mono_paragraph_style.libre_mono_font_size = True + def maybe_save_keyword_uri(self, attrs: xml.sax.xmlreader.AttributesImpl) -> bool: + if len(attrs.getNames()) == 1: # Assume a single attribute "xlink:href" + attr = "xlink:href" + if attr in attrs.getNames(): + href = attrs.getValue(attr) + if re.fullmatch(r'#[A-Za-z0-9_ ]+', href): + self.current_uri = href[1:] + return True + return False + def maybe_write_characters(self) -> None: if len(self.char_buf) > 0: # NOTE: We need to escape the content before we apply the regex pattern @@ -307,6 +343,12 @@ def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl): elif name == "text:a": # We are inside an anchor, and we should not insert another text:a tag here self.in_a = True + # This might be a link to an existing keyword. We might need to update its URI + if self.maybe_save_keyword_uri(attrs): + # Delay writing the start tag until we have checked the content of the tag + # in the characters() callback, since we might need to replace the URI attribute + # with a new URI. + return elif name == "text:span": if "text:style-name" in attrs.getNames(): style_name = attrs.getValue("text:style-name") @@ -351,70 +393,87 @@ def __init__( self.kw_uri_map = kw_uri_map self.check_changed = check_changed - def insert_links(self) -> int: + def insert_links(self) -> tuple[int, int]: num_links_inserted = 0 + num_uris_updated = 0 if len(self.chapters) > 0: - num_links_inserted += self.insert_links_in_chapters() + count1, count2 = self.insert_links_in_chapters() + num_links_inserted += count1 + num_uris_updated += count2 if len(self.subsections) > 0: - num_links_inserted += self.insert_links_in_subsections() + count1, count2 = self.insert_links_in_subsections() + num_links_inserted += count1 + num_uris_updated += count2 if len(self.appendices) > 0: - num_links_inserted += self.insert_links_in_appendices() - return num_links_inserted + count1, count2 = self.insert_links_in_appendices() + num_links_inserted += count1 + num_uris_updated += count2 + return num_links_inserted, num_uris_updated - def insert_links_in_chapters(self) -> int: + def insert_links_in_chapters(self) -> tuple[int, int]: start_dir = self.maindir / Directories.chapters num_links_inserted = 0 + num_uris_updated = 0 for chapter in self.chapters: logging.info(f"Processing chapter: {chapter}") filename = f"{chapter}.{FileExtensions.fodt}" path = start_dir / filename - count = self.insert_links_in_file(path, filename, FileType.CHAPTER) - num_links_inserted += count - return num_links_inserted + count1, count2 = self.insert_links_in_file(path, filename, FileType.CHAPTER) + num_links_inserted += count1 + num_uris_updated += count2 + return num_links_inserted, num_uris_updated - def insert_links_in_appendices(self) -> int: + def insert_links_in_appendices(self) -> tuple[int, int]: start_dir = self.maindir / Directories.appendices num_links_inserted = 0 + num_uris_updated = 0 for appendix in self.appendices: logging.info(f"Processing appendix: {appendix}") filename = f"{appendix}.{FileExtensions.fodt}" path = start_dir / filename - count = self.insert_links_in_file(path, filename, FileType.APPENDIX) - num_links_inserted += count - return num_links_inserted + count1, count2 = self.insert_links_in_file(path, filename, FileType.APPENDIX) + num_links_inserted += count1 + num_uris_updated += count2 + return num_links_inserted, num_uris_updated - def insert_links_in_subsections(self) -> int: + def insert_links_in_subsections(self) -> tuple[int, int]: start_dir = self.maindir / Directories.chapters / Directories.subsections num_links_inserted = 0 + num_uris_updated = 0 if self.filename: assert len(self.subsections) == 1 path = start_dir / self.subsections[0] / self.filename keyword_name = self.filename.removesuffix(f".{FileExtensions.fodt}") - num_links_inserted = self.insert_links_in_file(path, keyword_name, FileType.SUBSECTION) + count1, count2 = self.insert_links_in_file(path, keyword_name, FileType.SUBSECTION) + num_links_inserted = count1 + num_uris_updated = count2 else: for subsection in self.subsections: - count = self.insert_links_in_subsection(start_dir, subsection) - num_links_inserted += count - return num_links_inserted + count1, count2 = self.insert_links_in_subsection(start_dir, subsection) + num_links_inserted += count1 + num_uris_updated += count2 + return num_links_inserted, num_uris_updated - def insert_links_in_subsection(self, start_dir: Path, subsection: str) -> int: + def insert_links_in_subsection(self, start_dir: Path, subsection: str) -> tuple[int, int]: files_processed = 0 num_links_inserted = 0 + num_uris_updated = 0 item = start_dir / subsection logging.info(f"Processing subsection: {item.name}") for item2 in item.iterdir(): if item2.suffix == f".{FileExtensions.fodt}": keyword_name = item2.name.removesuffix(f".{FileExtensions.fodt}") files_processed += 1 - count = self.insert_links_in_file( + count1, count2 = self.insert_links_in_file( item2, keyword_name, FileType.SUBSECTION, verbose=False, indent=True ) - num_links_inserted += count + num_links_inserted += count1 + num_uris_updated += count2 if files_processed == 0: logging.info(" No files processed.") else: logging.info(f" Processed {files_processed} files.") - return num_links_inserted + return num_links_inserted, num_uris_updated def insert_links_in_file( self, @@ -423,7 +482,7 @@ def insert_links_in_file( file_type: FileType, verbose: bool = True, indent: bool = False - ) -> int: + ) -> tuple[int, int]: parser = xml.sax.make_parser() handler = FileHandler(file_info, file_type, self.kw_uri_map) parser.setContentHandler(handler) @@ -432,18 +491,25 @@ def insert_links_in_file( except HandlerDoneException as e: pass num_links_inserted = handler.get_num_links_inserted() + num_uris_updated = handler.get_num_uris_updated() indent_str = " " if indent else "" - if num_links_inserted > 0: + if (num_links_inserted > 0) or (num_uris_updated > 0): if self.check_changed: - logging.info(f"{indent_str}{filename.name}: Links would be inserted.") + if num_links_inserted > 0: + logging.info(f"{indent_str}{filename.name}: Links would be inserted.") + if num_uris_updated > 0: + logging.info(f"{indent_str}{filename.name}: URIs would be updated.") else: with open(filename, "w", encoding='utf8') as f: f.write(handler.content.getvalue()) - logging.info(f"{indent_str}{filename.name}: Inserted {num_links_inserted} links.") + if num_links_inserted > 0: + logging.info(f"{indent_str}{filename.name}: Inserted {num_links_inserted} links.") + if num_uris_updated > 0: + logging.info(f"{indent_str}{filename.name}: Updated {num_uris_updated} URIs.") else: if verbose and not self.check_changed: - logging.info(f"{indent_str}{filename.name}: No links inserted.") - return num_links_inserted + logging.info(f"{indent_str}{filename.name}: No links inserted or URIs updated.") + return (num_links_inserted, num_uris_updated) VALID_SUBSECTIONS = "4.3,5.3,6.3,7.3,8.3,9.3,10.3,11.3,12.3" VALID_CHAPTERS = "1,2,3,4,5,6,7,8,9,10,11,12" @@ -577,7 +643,7 @@ def link_keywords( kw_uri_map = keyword_uri_map_generator.get_kw_uri_map(maindir, keyword_dir) else: kw_uri_map = helpers.load_kw_uri_map(maindir) - num_links_inserted = InsertLinks( + num_links_inserted, num_uris_updated = InsertLinks( maindir, subsections, chapters, @@ -587,8 +653,19 @@ def link_keywords( check_changed ).insert_links() if check_changed: - if num_links_inserted > 0: - logging.error(f"Files have changed. {num_links_inserted} links would be inserted.") + if (num_links_inserted > 0) or (num_uris_updated > 0): + extra1 = None + msg = "Files have changed. " + if num_links_inserted > 0: + extra1 = f"{num_links_inserted} links would be inserted" + if num_uris_updated > 0: + if extra1 is not None: + msg += f"{extra1}, and " + msg += f"{num_uris_updated} URIs would be updated." + else: + if extra1 is not None: + msg += f"{extra1}." + logging.error(msg) exit(1) else: logging.info("Files have not changed.")