Skip to content

Commit

Permalink
Updated keyword linking script
Browse files Browse the repository at this point in the history
Updated keyword linking script to also update the URIs of existing links if
needed.
  • Loading branch information
hakonhagland committed Jan 13, 2025
1 parent 2513f96 commit ef26576
Showing 1 changed file with 116 additions and 39 deletions.
155 changes: 116 additions & 39 deletions scripts/python/src/fodt/keyword_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,13 @@ def __init__(
self.in_draw_frame = False # We should not insert links in Figure captions
self.in_draw_recursion = 0 # We can have nested draw:frame tags
self.content = io.StringIO()
# Create a regex pattern with alternation on the keyword names
# This pattern will be used as part of the regex pattern
self.keyword_pattern = self.compile_keyword_pattern()
# Regex pattern with alternation on the keyword names
self.regex = self.compile_regex()
self.current_uri = None # The URI for the current text:a tag
self.num_links_inserted = 0
self.num_uris_updated = 0 # Number of URIs for existing links that have been updated
self.office_body_found = False
# Set of paragraph styles using fixed width fonts, initialized with the
# "_40_Example" style that is used indirectly by the other example styles
Expand All @@ -111,19 +115,22 @@ def check_mono_paragraph(self, attrs: xml.sax.xmlreader.AttributesImpl) -> None:
if style_name in self.mono_paragraph_styles:
self.in_mono_paragraph = True

def compile_keyword_pattern(self) -> re.Pattern:
    """Compile an alternation pattern over the keyword names in ``self.kw_uri_map``.

    Every key of ``self.kw_uri_map`` is escaped with ``re.escape`` and the
    escaped names are joined with ``|``, longest first.

    Returns:
        The compiled pattern. If ``self.kw_uri_map`` has no keys, a pattern
        that never matches is returned.
    """
    escaped_names = [re.escape(name) for name in self.kw_uri_map]
    if not escaped_names:
        # An empty alternation would compile to '' and match the empty string at
        # every position. "(?!)" is a negative lookahead that always fails, so
        # with no keywords nothing is ever matched.
        return re.compile(r'(?!)')
    # Need to sort the keys by length in descending order to avoid
    # matching a substring of a longer keyword. See
    # https://github.com/OPM/opm-reference-manual/pull/411#discussion_r1835446631
    escaped_names.sort(key=len, reverse=True)
    return re.compile('|'.join(escaped_names))

def compile_regex(self) -> re.Pattern:
# Also include the keyword name itself in the regex pattern, see discussion
# https://github.com/OPM/opm-reference-manual/pull/410
pattern = re.compile(
r'(?<![.‘"“])' # Negative lookbehind for a dot or a single/double quote
r'(?<!&quot;)' # Negative lookbehind: no HTML double-quote entity before keyword
r'\b(' +
'|'.join(
# Need to sort the keys by length in descending order to avoid
# matching a substring of a longer keyword. See
# https://github.com/OPM/opm-reference-manual/pull/411#discussion_r1835446631
sorted((re.escape(k) for k in self.kw_uri_map.keys()), key=len, reverse=True)
) +
r'\b(' + self.keyword_pattern.pattern +
# NOTE: We cannot use \b here because if the keyword ends with "-" the word boundary
# \b will not match between a space and a hyphen. Instead we use a negative lookahead
# Negative lookaheads: no word char, "-" or &apos; after the keyword
Expand All @@ -132,6 +139,7 @@ def compile_regex(self) -> re.Pattern:
return pattern

def characters(self, content: str):
self.check_keyword_link(content)
# NOTE: characters() is only called if there is content between the start
# tag and the end tag. If there is no content, characters() is not called.
if self.start_tag_open:
Expand All @@ -148,6 +156,21 @@ def characters(self, content: str):
if self.in_p and content.startswith("Table "):
self.table_caption_info.seen_table_txt = True

def check_keyword_link(self, content: str) -> None:
    """Write a delayed ``<text:a>`` start tag, refreshing its URI if needed.

    Does nothing unless ``self.current_uri`` holds the URI of a ``text:a``
    start tag whose writing was postponed. Otherwise, if ``content`` is
    exactly one of the keyword names matched by ``self.keyword_pattern`` and
    ``self.kw_uri_map`` maps that keyword to a different URI, the mapped URI
    replaces the saved one and ``self.num_uris_updated`` is incremented. The
    start tag is then written to ``self.content`` and the pending state is
    cleared.

    Args:
        content: The character data passed to ``characters()``.
    """
    pending_uri = self.current_uri
    if pending_uri is None:
        return  # No postponed text:a start tag is waiting to be written
    # If the link text is exactly a keyword name, the link must point to the URI
    # that kw_uri_map holds for that keyword; otherwise the saved URI is kept.
    # NOTE(review): only this one chunk of character data is compared, since the
    # pending URI is cleared below. Confirm link text is not delivered in parts.
    if self.keyword_pattern.fullmatch(content):
        mapped_uri = self.kw_uri_map[content]
        if mapped_uri != pending_uri:
            pending_uri = mapped_uri
            self.num_uris_updated += 1
    # The start tag is written here, now that the content has been checked
    self.content.write(f'<text:a xlink:href="#{pending_uri}">')
    self.start_tag_open = False
    self.current_uri = None

def collect_example_style(self, attrs: xml.sax.xmlreader.AttributesImpl) -> None:
# Collect the paragraph styles that use fixed width fonts
if "style:name" in attrs.getNames():
Expand Down Expand Up @@ -198,6 +221,9 @@ def get_content(self) -> str:
def get_num_links_inserted(self) -> int:
return self.num_links_inserted

def get_num_uris_updated(self) -> int:
    """Return ``self.num_uris_updated``, the count of existing link URIs updated."""
    return self.num_uris_updated

def is_table_caption(self, content: str) -> bool:
# Check if the content is a specific table caption, in that case we should not insert links
keyword_name = self.file_info
Expand Down Expand Up @@ -242,6 +268,16 @@ def maybe_collect_mono_paragraph_style(
#if len(fontsize) > 0: # Check if the font size is set
# self.mono_paragraph_style.libre_mono_font_size = True

def maybe_save_keyword_uri(self, attrs: xml.sax.xmlreader.AttributesImpl) -> bool:
    """Save the target of a link whose only attribute is a local ``xlink:href``.

    The href is saved only when ``attrs`` holds exactly one attribute, named
    ``xlink:href``, and its value is ``#`` followed by one or more letters,
    digits, underscores or spaces. The text after the ``#`` is then stored
    in ``self.current_uri``.

    Args:
        attrs: The attributes of the element being started.

    Returns:
        True if ``self.current_uri`` was set, False otherwise (in which case
        ``self.current_uri`` is left untouched).
    """
    href_attr = "xlink:href"
    names = attrs.getNames()
    # Assume a single attribute "xlink:href"; anything else is not handled here
    if len(names) != 1 or href_attr not in names:
        return False
    href = attrs.getValue(href_attr)
    if not re.fullmatch(r'#[A-Za-z0-9_ ]+', href):
        return False
    self.current_uri = href[1:]  # Drop the leading "#"
    return True

def maybe_write_characters(self) -> None:
if len(self.char_buf) > 0:
# NOTE: We need to escape the content before we apply the regex pattern
Expand Down Expand Up @@ -307,6 +343,12 @@ def startElement(self, name:str, attrs: xml.sax.xmlreader.AttributesImpl):
elif name == "text:a":
# We are inside an anchor, and we should not insert another text:a tag here
self.in_a = True
# This might be a link to an existing keyword. We might need to update its URI
if self.maybe_save_keyword_uri(attrs):
# Delay writing the start tag until we have checked the content of the tag
# in the characters() callback, since we might need to replace the URI attribute
# with a new URI.
return
elif name == "text:span":
if "text:style-name" in attrs.getNames():
style_name = attrs.getValue("text:style-name")
Expand Down Expand Up @@ -351,70 +393,87 @@ def __init__(
self.kw_uri_map = kw_uri_map
self.check_changed = check_changed

def insert_links(self) -> int:
def insert_links(self) -> tuple[int, int]:
num_links_inserted = 0
num_uris_updated = 0
if len(self.chapters) > 0:
num_links_inserted += self.insert_links_in_chapters()
count1, count2 = self.insert_links_in_chapters()
num_links_inserted += count1
num_uris_updated += count2
if len(self.subsections) > 0:
num_links_inserted += self.insert_links_in_subsections()
count1, count2 = self.insert_links_in_subsections()
num_links_inserted += count1
num_uris_updated += count2
if len(self.appendices) > 0:
num_links_inserted += self.insert_links_in_appendices()
return num_links_inserted
count1, count2 = self.insert_links_in_appendices()
num_links_inserted += count1
num_uris_updated += count2
return num_links_inserted, num_uris_updated

def insert_links_in_chapters(self) -> int:
def insert_links_in_chapters(self) -> tuple[int, int]:
start_dir = self.maindir / Directories.chapters
num_links_inserted = 0
num_uris_updated = 0
for chapter in self.chapters:
logging.info(f"Processing chapter: {chapter}")
filename = f"{chapter}.{FileExtensions.fodt}"
path = start_dir / filename
count = self.insert_links_in_file(path, filename, FileType.CHAPTER)
num_links_inserted += count
return num_links_inserted
count1, count2 = self.insert_links_in_file(path, filename, FileType.CHAPTER)
num_links_inserted += count1
num_uris_updated += count2
return num_links_inserted, num_uris_updated

def insert_links_in_appendices(self) -> int:
def insert_links_in_appendices(self) -> tuple[int, int]:
start_dir = self.maindir / Directories.appendices
num_links_inserted = 0
num_uris_updated = 0
for appendix in self.appendices:
logging.info(f"Processing appendix: {appendix}")
filename = f"{appendix}.{FileExtensions.fodt}"
path = start_dir / filename
count = self.insert_links_in_file(path, filename, FileType.APPENDIX)
num_links_inserted += count
return num_links_inserted
count1, count2 = self.insert_links_in_file(path, filename, FileType.APPENDIX)
num_links_inserted += count1
num_uris_updated += count2
return num_links_inserted, num_uris_updated

def insert_links_in_subsections(self) -> int:
def insert_links_in_subsections(self) -> tuple[int, int]:
start_dir = self.maindir / Directories.chapters / Directories.subsections
num_links_inserted = 0
num_uris_updated = 0
if self.filename:
assert len(self.subsections) == 1
path = start_dir / self.subsections[0] / self.filename
keyword_name = self.filename.removesuffix(f".{FileExtensions.fodt}")
num_links_inserted = self.insert_links_in_file(path, keyword_name, FileType.SUBSECTION)
count1, count2 = self.insert_links_in_file(path, keyword_name, FileType.SUBSECTION)
num_links_inserted = count1
num_uris_updated = count2
else:
for subsection in self.subsections:
count = self.insert_links_in_subsection(start_dir, subsection)
num_links_inserted += count
return num_links_inserted
count1, count2 = self.insert_links_in_subsection(start_dir, subsection)
num_links_inserted += count1
num_uris_updated += count2
return num_links_inserted, num_uris_updated

def insert_links_in_subsection(self, start_dir: Path, subsection: str) -> int:
def insert_links_in_subsection(self, start_dir: Path, subsection: str) -> tuple[int, int]:
files_processed = 0
num_links_inserted = 0
num_uris_updated = 0
item = start_dir / subsection
logging.info(f"Processing subsection: {item.name}")
for item2 in item.iterdir():
if item2.suffix == f".{FileExtensions.fodt}":
keyword_name = item2.name.removesuffix(f".{FileExtensions.fodt}")
files_processed += 1
count = self.insert_links_in_file(
count1, count2 = self.insert_links_in_file(
item2, keyword_name, FileType.SUBSECTION, verbose=False, indent=True
)
num_links_inserted += count
num_links_inserted += count1
num_uris_updated += count2
if files_processed == 0:
logging.info(" No files processed.")
else:
logging.info(f" Processed {files_processed} files.")
return num_links_inserted
return num_links_inserted, num_uris_updated

def insert_links_in_file(
self,
Expand All @@ -423,7 +482,7 @@ def insert_links_in_file(
file_type: FileType,
verbose: bool = True,
indent: bool = False
) -> int:
) -> tuple[int, int]:
parser = xml.sax.make_parser()
handler = FileHandler(file_info, file_type, self.kw_uri_map)
parser.setContentHandler(handler)
Expand All @@ -432,18 +491,25 @@ def insert_links_in_file(
except HandlerDoneException as e:
pass
num_links_inserted = handler.get_num_links_inserted()
num_uris_updated = handler.get_num_uris_updated()
indent_str = " " if indent else ""
if num_links_inserted > 0:
if (num_links_inserted > 0) or (num_uris_updated > 0):
if self.check_changed:
logging.info(f"{indent_str}{filename.name}: Links would be inserted.")
if num_links_inserted > 0:
logging.info(f"{indent_str}{filename.name}: Links would be inserted.")
if num_uris_updated > 0:
logging.info(f"{indent_str}{filename.name}: URIs would be updated.")
else:
with open(filename, "w", encoding='utf8') as f:
f.write(handler.content.getvalue())
logging.info(f"{indent_str}{filename.name}: Inserted {num_links_inserted} links.")
if num_links_inserted > 0:
logging.info(f"{indent_str}{filename.name}: Inserted {num_links_inserted} links.")
if num_uris_updated > 0:
logging.info(f"{indent_str}{filename.name}: Updated {num_uris_updated} URIs.")
else:
if verbose and not self.check_changed:
logging.info(f"{indent_str}{filename.name}: No links inserted.")
return num_links_inserted
logging.info(f"{indent_str}{filename.name}: No links inserted or URIs updated.")
return (num_links_inserted, num_uris_updated)

VALID_SUBSECTIONS = "4.3,5.3,6.3,7.3,8.3,9.3,10.3,11.3,12.3"
VALID_CHAPTERS = "1,2,3,4,5,6,7,8,9,10,11,12"
Expand Down Expand Up @@ -577,7 +643,7 @@ def link_keywords(
kw_uri_map = keyword_uri_map_generator.get_kw_uri_map(maindir, keyword_dir)
else:
kw_uri_map = helpers.load_kw_uri_map(maindir)
num_links_inserted = InsertLinks(
num_links_inserted, num_uris_updated = InsertLinks(
maindir,
subsections,
chapters,
Expand All @@ -587,8 +653,19 @@ def link_keywords(
check_changed
).insert_links()
if check_changed:
if num_links_inserted > 0:
logging.error(f"Files have changed. {num_links_inserted} links would be inserted.")
if (num_links_inserted > 0) or (num_uris_updated > 0):
extra1 = None
msg = "Files have changed. "
if num_links_inserted > 0:
extra1 = f"{num_links_inserted} links would be inserted"
if num_uris_updated > 0:
if extra1 is not None:
msg += f"{extra1}, and "
msg += f"{num_uris_updated} URIs would be updated."
else:
if extra1 is not None:
msg += f"{extra1}."
logging.error(msg)
exit(1)
else:
logging.info("Files have not changed.")
Expand Down

0 comments on commit ef26576

Please sign in to comment.