Merge pull request #19 from ahlec/strip-furigana-before-reading

Remove <ruby> furigana before generating readings
obynio · Oct 7, 2022 · 117ab41
2 parents 6213c22 + e7651c3
commit 117ab41
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 18 deletions.
diff --git a/__init__.py b/__init__.py
--- a/__init__.py
+++ b/__init__.py
@@ -15,7 +15,6 @@
 # You should have received a copy of the GNU General Public License
 # along with Japanese Furigana.  If not, see <http://www.gnu.org/licenses/>.
 
-import re
 import os
 
 from aqt.utils import tooltip
@@ -29,6 +28,7 @@
 from . import reading
 from . import config
 from .selection import Selection
+from .utils import removeFurigana
 
 mecab  = reading.MecabController()
 config = config.Config()
@@ -57,31 +57,19 @@ def doIt(editor, action):
 
 def generateFurigana(editor, s):
     html = s.selected
-    html = re.sub('\[[^\]]*\]', '', html)
+    html = removeFurigana(html)
     html = mecab.reading(html, config.getIgnoreNumbers(), config.getUseRubyTags())
     if html == s.selected:
         tooltip("Nothing to generate!")
     else:
         s.modify(html)
 
 def deleteFurigana(editor, s):
-    html = s.selected
-    if config.getUseRubyTags():
-        betweens = list(map(lambda x: "<ruby>"+x+"</ruby>", re.findall(r"<ruby>(.*?)<\/ruby>", html)))
-        if len(betweens) == 0:
-            tooltip("No furigana found to delete")
-        else:
-            for b in betweens:
-                replacement = re.search(r"<ruby>(.*?)<rp>",b).group(1).strip()
-                html = html.replace(b, replacement)
-            s.modify(html)
+    stripped = removeFurigana(s.selected)
+    if stripped == s.selected:
+        tooltip("No furigana found to delete")
     else:
-        html, deletions = re.subn('\[[^\]]*\]', '', html)
-
-        if deletions == 0:
-            tooltip("No furigana found to delete")
-        else:
-            s.modify(html)
+        s.modify(stripped)
 
 setupGuiMenu()
 addHook("setupEditorButtons", addButtons)
diff --git a/test/test_utils.py b/test/test_utils.py
--- /dev/null
+++ b/test/test_utils.py
@@ -0,0 +1,34 @@
+import unittest
+
+import utils
+
+class TestRemoveFurigana(unittest.TestCase):
+
+    # empty string should return empty string
+    def testEmptyString(self):
+        self.assertEqual(utils.removeFurigana(""), "")
+
+    # ensure that bracket notation is correctly removed
+    def testRemovesBrackets(self):
+        self.assertEqual(utils.removeFurigana("日本語[にほんご]を勉強[べんきょう]する"), "日本語を勉強する")
+        self.assertEqual(utils.removeFurigana("走[はし]り込[こ]む"), "走り込む")
+
+    # ensure that ruby tags are correctly removed
+    def testRemovesRuby(self):
+        self.assertEqual(utils.removeFurigana("<ruby>日本語<rp>(</rp><rt>にほんご</rt><rp>)</rp></ruby>を<ruby>勉強<rp>(</rp><rt>べんきょう</rt><rp>)</rp></ruby>する"), "日本語を勉強する")
+        self.assertEqual(utils.removeFurigana("<ruby>走<rp>(</rp><rt>はし</rt><rp>)</rp></ruby>り<ruby>込<rp>(</rp><rt>こ</rt><rp>)</rp></ruby>む"), "走り込む")
+
+    # ensure that <ruby /> tags without the inessential <rp /> tags are stripped
+    def testRemovesRubyWithoutRp(self):
+        self.assertEqual(utils.removeFurigana("<ruby>日本語<rt>にほんご</rt></ruby>を<ruby>勉強<rt>べんきょう</rt></ruby>する"), "日本語を勉強する")
+        self.assertEqual(utils.removeFurigana("<ruby>走<rt>はし</rt></ruby>り<ruby>込<rt>こ</rt></ruby>む"), "走り込む")
+
+    # ensure that non-<ruby> related HTML tags are preserved
+    def testPreservesOtherHtml(self):
+        self.assertEqual(utils.removeFurigana("<b>日本語</b>"), "<b>日本語</b>")
+        self.assertEqual(utils.removeFurigana("ビルの<ruby>形<rp>(</rp><rt>かたち</rt><rp>)</rp></ruby>はほぼ<b><u><ruby>正方形<rp>(</rp><rt>せいほうけい</rt><rp>)</rp></ruby></u></b>だった。"), "ビルの形はほぼ<b><u>正方形</u></b>だった。")
+
+    # ensure that the utility function will remove both styles from the same string
+    # (which also ensures that we're decoupled from the user's current config selection)
+    def testRemovesBothNotations(self):
+        self.assertEqual(utils.removeFurigana("<ruby>日本語<rp>(</rp><rt>にほんご</rt><rp>)</rp></ruby>を勉強[べんきょう]する"), "日本語を勉強する")
diff --git a/utils.py b/utils.py
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+def removeFurigana(text: str):
+    stripped = text
+
+    # First, remove Ruby tags
+    rubyTags: list[str] = re.findall(r"<ruby>(.*?)<\/ruby>", stripped)
+    for ruby in rubyTags:
+        # Figure out what the actual body of the <ruby /> tag is.
+        # Current approach: strip away any HTML tags that handle the annotation, to
+        # arrive at just the body. Considering only the current HTML specification,
+        # the tags to strip away are: <rp>, <rt>
+        body = re.sub(r"<rp>(.*?)<\/rp>|<rt>(.*?)<\/rt>", "", ruby)
+
+        # Replace the entire <ruby> block with just the body.
+        # NOTE: We'll need to include the <ruby> tags around the search string, since
+        # they aren't included in the original regex response
+        stripped = stripped.replace("<ruby>" + ruby + "</ruby>", body)
+
+    # Next, remove the bracket notation
+    stripped, _ = re.subn('\[[^\]]*\]', '', stripped)
+
+    # Return the final string
+    return stripped