Merge pull request #50 from imjohnbo/treewalker

Fix: target correct link when multiple matches are present
github · Apr 21, 2022 · 287767d · 287767d
2 parents 2906188 + d3ec435
commit 287767d
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 50 deletions.
diff --git a/src/paste-markdown-html.ts b/src/paste-markdown-html.ts
@@ -8,8 +8,6 @@ export function uninstall(el: HTMLElement): void {
   el.removeEventListener('paste', onPaste)
 }
 
-type MarkdownTransformer = (element: HTMLElement | HTMLAnchorElement, args: string[]) => string
-
 function onPaste(event: ClipboardEvent) {
   const transfer = event.clipboardData
   // if there is no clipboard data, or
@@ -20,65 +18,78 @@ function onPaste(event: ClipboardEvent) {
   if (!(field instanceof HTMLTextAreaElement)) return
 
   // Get the plaintext and html version of clipboard contents
-  let text = transfer.getData('text/plain')
+  let plaintext = transfer.getData('text/plain')
   const textHTML = transfer.getData('text/html')
   // Replace Unicode equivalent of "&nbsp;" with a space
-  const textHTMLClean = textHTML.replace(/\u00A0/g, ' ')
+  const textHTMLClean = textHTML.replace(/\u00A0/g, ' ').replace(/\uC2A0/g, ' ')
   if (!textHTML) return
 
-  text = text.trim()
-  if (!text) return
+  plaintext = plaintext.trim()
+  if (!plaintext) return
 
   // Generate DOM tree from HTML string
   const parser = new DOMParser()
   const doc = parser.parseFromString(textHTMLClean, 'text/html')
+  const walker = doc.createTreeWalker(doc.body, NodeFilter.SHOW_ELEMENT)
 
-  const a = doc.getElementsByTagName('a')
-  const markdown = transform(a, text, linkify as MarkdownTransformer)
+  const markdown = convertToMarkdown(plaintext, walker)
 
   // If no changes made by transforming
-  if (markdown === text) return
+  if (markdown === plaintext) return
 
   event.stopPropagation()
   event.preventDefault()
 
   insertText(field, markdown)
 }
 
-// Build a markdown string from a DOM tree and plaintext
-function transform(
-  elements: HTMLCollectionOf<HTMLElement>,
-  text: string,
-  transformer: MarkdownTransformer,
-  ...args: string[]
-): string {
-  const markdownParts = []
-  for (const element of elements) {
-    const textContent = element.textContent || ''
-    const {part, index} = trimAfter(text, textContent)
-    if (index >= 0) {
-      markdownParts.push(part.replace(textContent, transformer(element, args)))
-      text = text.slice(index)
+function convertToMarkdown(plaintext: string, walker: TreeWalker): string {
+  let currentNode = walker.firstChild()
+  let markdown = plaintext
+  let markdownIgnoreBeforeIndex = 0
+  let index = 0
+  const NODE_LIMIT = 10000
+
+  // Walk through the DOM tree
+  while (currentNode && index < NODE_LIMIT) {
+    index++
+    const text = isLink(currentNode) ? currentNode.textContent || '' : (currentNode.firstChild as Text)?.wholeText || ''
+
+    // No need to transform whitespace
+    if (isEmptyString(text)) {
+      currentNode = walker.nextNode()
+      continue
+    }
+
+    // Find the index where "text" is found in "markdown" _after_ "markdownIgnoreBeforeIndex"
+    const markdownFoundIndex = markdown.indexOf(text, markdownIgnoreBeforeIndex)
+
+    if (markdownFoundIndex >= 0) {
+      if (isLink(currentNode)) {
+        const markdownLink = linkify(currentNode)
+        // Transform 'example link plus more text' into 'example [link](https://example.com/) plus more text'
+        // Method: 'example [link](https://example.com/) plus more text' = 'example ' + '[link](https://example.com/)' + ' plus more text'
+        markdown =
+          markdown.slice(0, markdownFoundIndex) + markdownLink + markdown.slice(markdownFoundIndex + text.length)
+        markdownIgnoreBeforeIndex = markdownFoundIndex + markdownLink.length
+      } else {
+        markdownIgnoreBeforeIndex = markdownFoundIndex + text.length
+      }
     }
+
+    currentNode = walker.nextNode()
   }
-  markdownParts.push(text)
-  return markdownParts.join('')
-}
 
-// Trim text at index of last character of the first occurrence of "search" and
-// return a new string with the substring until the index
-//  Example: trimAfter('Hello world', 'world') => {part: 'Hello world', index: 11}
-//  Example: trimAfter('Hello world', 'bananas') => {part: '', index: -1}
-function trimAfter(text: string, search = ''): {part: string; index: number} {
-  let index = text.indexOf(search)
-  if (index === -1) return {part: '', index}
+  // If we hit the node limit the walk may be incomplete, so fall back to the original plaintext
+  return index === NODE_LIMIT ? plaintext : markdown
+}
 
-  index += search.length
+function isEmptyString(text: string): boolean {
+  return !text || text?.trim().length === 0
+}
 
-  return {
-    part: text.substring(0, index),
-    index
-  }
+function isLink(node: Node): node is HTMLAnchorElement {
+  return (node as HTMLElement).tagName?.toLowerCase() === 'a' && (node as HTMLElement).hasAttribute('href')
 }
 
 function hasHTML(transfer: DataTransfer): boolean {

diff --git a/test/test.js b/test/test.js
@@ -132,10 +132,10 @@ describe('paste-markdown', function () {
 
     it('turns mixed html content containing several links into appropriate markdown', function () {
       // eslint-disable-next-line github/unescaped-html-literal
-      const sentence = `<meta charset='utf-8'><meta charset="utf-8">
+      const sentence = `<meta charset='utf-8'>
         <b style="font-weight:normal;"><p dir="ltr"><span>This is a </span>
-        <a href="https://github.com/"><span>link</span></a><span> and </span>
-        <a href="https://www.youtube.com/watch?v=dQw4w9WgXcQ"><span>another link</span></a></p>
+        <a href="https://github.com/">link</a><span> and </span>
+        <a href="https://www.youtube.com/watch?v=dQw4w9WgXcQ">another link</a></p>
         <br /><a href="https://github.com/"><span>Link</span></a><span> at the beginning, link at the </span>
         <a href="https://github.com/"><span>end</span></a></b>`
       // eslint-disable-next-line i18n-text/no-en
@@ -186,19 +186,29 @@ describe('paste-markdown', function () {
 
     it('leaves plaintext links alone', function () {
       // eslint-disable-next-line github/unescaped-html-literal
-      const sentence = `<meta charset='utf-8'><meta charset="utf-8">
+      const sentence = `<meta charset='utf-8'>
         <b style="font-weight:normal;"><p dir="ltr"><span>This is a </span>
-        <a href="https://github.com/"><span>https://github.com</span></a><span> and </span>
-        <a href="https://www.youtube.com/watch?v=dQw4w9WgXcQ"><span>another link</span></a></p>
-        <br /><a href="https://github.com/"><span>Link</span></a><span> at the beginning, link at the </span>
-        <a href="https://github.com/"><span>https://github.com/</span></a></b>`
+        <a href="https://github.com/">link</a><span> and </span>
+        <a href="https://www.youtube.com/watch?v=dQw4w9WgXcQ">another link</a></p>
+        <br /><a href="https://github.com/">Link</a><span> at the beginning, link at the </span>
+        <a href="https://github.com/"><span>end</span></a></b>`
       /* eslint-disable i18n-text/no-en */
-      const plaintextSentence =
-        'This is a https://github.com and another link\n\nLink at the beginning, link at the https://github.com/'
+      const plaintextSentence = 'This is a link and another link\n\nLink at the beginning, link at the end'
       /* eslint-enable i18n-text/no-en */
       const markdownSentence =
-        'This is a https://github.com/ and [another link](https://www.youtube.com/watch?v=dQw4w9WgXcQ)\n\n' +
-        '[Link](https://github.com/) at the beginning, link at the https://github.com/'
+        'This is a [link](https://github.com/) and [another link](https://www.youtube.com/watch?v=dQw4w9WgXcQ)\n\n' +
+        '[Link](https://github.com/) at the beginning, link at the [end](https://github.com/)'
+
+      paste(textarea, {'text/html': sentence, 'text/plain': plaintextSentence})
+      assert.equal(textarea.value, markdownSentence)
+    })
+
+    it('finds the right link when identical labels are present', function () {
+      // eslint-disable-next-line github/unescaped-html-literal
+      const sentence = `<meta charset='utf-8'><span>example<span> </span>
+      </span><a href="https://example.com/">example</a>`
+      const plaintextSentence = 'example example'
+      const markdownSentence = 'example [example](https://example.com/)'
 
       paste(textarea, {'text/html': sentence, 'text/plain': plaintextSentence})
       assert.equal(textarea.value, markdownSentence)