Skip to content

Commit

Permalink
Merge pull request #131 from JohannesKaufmann/improve-nested-lists
Browse files Browse the repository at this point in the history
improve-nested-lists
  • Loading branch information
JohannesKaufmann authored Dec 26, 2024
2 parents 5a52a80 + 0191495 commit d904841
Show file tree
Hide file tree
Showing 8 changed files with 305 additions and 5 deletions.
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ module github.com/JohannesKaufmann/html-to-markdown/v2
go 1.22.1

require (
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364
github.com/JohannesKaufmann/dom v0.2.0
github.com/agnivade/levenshtein v1.2.0
github.com/andybalholm/cascadia v1.3.2
github.com/andybalholm/cascadia v1.3.3
github.com/muesli/termenv v0.15.2
github.com/sebdah/goldie/v2 v2.5.5
github.com/yuin/goldmark v1.7.8
golang.org/x/net v0.32.0
golang.org/x/net v0.33.0
)

require (
Expand Down
38 changes: 38 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 h1:TDlO/A2QqlNhdvH+hDnu8cv1rouhfHgLwhGzJeHGgFQ=
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364/go.mod h1:U+fBZLZTYiZCOwQUT04V3J4I+0TxyLNnj0R8nBlO4fk=
github.com/JohannesKaufmann/dom v0.2.0 h1:1bragmEb19K8lHAqgFgqCpiPCFEZMTXzOIEjuxkUfLQ=
github.com/JohannesKaufmann/dom v0.2.0/go.mod h1:57iSUl5RKric4bUkgos4zu6Xt5LMHUnw3TF1l5CbGZo=
github.com/agnivade/levenshtein v1.2.0 h1:U9L4IOT0Y3i0TIlUIDJ7rVUziKi/zPbrJGaFrtYH3SY=
github.com/agnivade/levenshtein v1.2.0/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
Expand All @@ -13,6 +17,7 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo=
github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
Expand Down Expand Up @@ -44,18 +49,35 @@ github.com/yuin/goldmark v1.7.8 h1:iERMLn0/QJeHFhxSt3p6PeN9mGnvIKSpG9YYorDMnic=
github.com/yuin/goldmark v1.7.8/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
Expand All @@ -64,21 +86,37 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
Expand Down
53 changes: 53 additions & 0 deletions internal/domutils/list_items.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package domutils

import (
"context"
"strings"

"github.com/JohannesKaufmann/dom"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

// MoveListItems walks the tree rooted at n and, for every "ol"/"ul" element,
// makes sure that stray (non-"li") children end up inside an "li":
//
//   - A stray node that follows an "li" is appended to that "li".
//   - A stray node that appears before any "li" is wrapped in a freshly
//     created "li"; later stray nodes are then appended to that wrapper
//     until the next real "li" is encountered.
//   - Text nodes that contain only whitespace are left where they are.
//
// The function recurses into all children afterwards, so nested lists
// (including ones that were just moved) are processed as well.
//
// ctx is not consulted here; it is only handed on to the recursive calls.
func MoveListItems(ctx context.Context, n *html.Node) {
	isList := n.Type == html.ElementNode && (n.Data == "ol" || n.Data == "ul")

	if isList {
		// The most recent "li" seen so far (either an existing one or a
		// wrapper created below). Stray nodes get attached to it.
		var lastItem *html.Node

		// Iterate over a collected list of the children rather than the live
		// sibling chain, because the loop re-parents nodes as it goes.
		for _, node := range dom.AllChildNodes(n) {
			switch {
			case node.Type == html.ElementNode && node.Data == "li":
				// A regular list item: remember it as the target for
				// whatever stray content follows.
				lastItem = node

			case node.Type == html.TextNode && strings.TrimSpace(node.Data) == "":
				// Whitespace-only text, most likely source formatting.
				// Nothing to do.

			case lastItem != nil:
				// Stray content after an "li": detach it from the list
				// and append it to that "li".
				n.RemoveChild(node)
				lastItem.AppendChild(node)

			default:
				// Stray content with no preceding "li": give it its own
				// "li" wrapper. The result of dom.WrapNode becomes the
				// target for subsequent stray nodes.
				lastItem = dom.WrapNode(node, &html.Node{
					Type:     html.ElementNode,
					DataAtom: atom.Li,
					Data:     "li",
				})
			}
		}
	}

	// Descend into every child, whether or not n itself was a list.
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		MoveListItems(ctx, c)
	}
}
123 changes: 123 additions & 0 deletions internal/domutils/list_items_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package domutils

import (
"context"
"testing"

"github.com/JohannesKaufmann/html-to-markdown/v2/internal/tester"
)

// TestMoveListItems is a table-driven test for MoveListItems.
//
// Every case parses an HTML snippet with tester.Parse, runs MoveListItems on
// the parsed document and then compares the resulting tree with the expected
// representation through tester.ExpectRepresentation.
func TestMoveListItems(t *testing.T) {
// runs holds the test cases: desc is used as the subtest name, input is
// the HTML that gets parsed, expected is the tree representation that
// the document must have after MoveListItems ran.
runs := []struct {
desc string
input string
expected string
}{
{
// A list whose children are all "li" elements must stay untouched.
desc: "not needed in normal list",
input: "<div><ul><li>A</li><li>B</li><li>C</li></ul></div>",
expected: `
├─body
│ ├─div
│ │ ├─ul
│ │ │ ├─li
│ │ │ │ ├─#text "A"
│ │ │ ├─li
│ │ │ │ ├─#text "B"
│ │ │ ├─li
│ │ │ │ ├─#text "C"
`,
},
{
// Stray text after an "li" is appended to that "li".
desc: "#text moves into the previous li",
input: "<ul><li>A</li>B</ul>",
expected: `
├─body
│ ├─ul
│ │ ├─li
│ │ │ ├─#text "A"
│ │ │ ├─#text "B"
`,
},
{
// A stray element after an "li" is appended to that "li".
desc: "div moves into the previous li",
input: "<ul><li>A</li><div>B</div></ul>",
expected: `
├─body
│ ├─ul
│ │ ├─li
│ │ │ ├─#text "A"
│ │ │ ├─div
│ │ │ │ ├─#text "B"
`,
},
{
// A list placed directly inside another list becomes a child of the
// preceding "li".
desc: "ol moves into the previous li",
input: "<ul><li>A</li><ol><li>B</li></ol></ul>",
expected: `
├─body
│ ├─ul
│ │ ├─li
│ │ │ ├─#text "A"
│ │ │ ├─ol
│ │ │ │ ├─li
│ │ │ │ │ ├─#text "B"
`,
},
{
// Without any "li" in the list, a single new "li" is expected to hold
// both spans.
desc: "no existing li",
input: "<ul><span>A</span><span>B</span></ul>",
expected: `
├─body
│ ├─ul
│ │ ├─li
│ │ │ ├─span
│ │ │ │ ├─#text "A"
│ │ │ ├─span
│ │ │ │ ├─#text "B"
`,
},
{
// Multi-line input: whitespace-only text nodes are expected to stay in
// place while the nested "ol" moves into the "li" before it.
// NOTE(review): the expected tree contains "\n\t" and "\n\t\t" text
// nodes, so the input below presumably is tab-indented -- confirm
// that the indentation inside the raw string is intact.
desc: "basic moved list",
input: `
<ol>
<li>One</li>
<li>Two</li>
<ol>
<li>Two point one</li>
<li>Two point two</li>
</ol>
</ol>
`,
expected: `
├─body
│ ├─ol
│ │ ├─#text "\n\t"
│ │ ├─li
│ │ │ ├─#text "One"
│ │ ├─#text "\n\t"
│ │ ├─li
│ │ │ ├─#text "Two"
│ │ │ ├─ol
│ │ │ │ ├─#text "\n\t\t"
│ │ │ │ ├─li
│ │ │ │ │ ├─#text "Two point one"
│ │ │ │ ├─#text "\n\t\t"
│ │ │ │ ├─li
│ │ │ │ │ ├─#text "Two point two"
│ │ │ │ ├─#text "\n\t"
│ │ ├─#text "\n\t"
│ │ ├─#text "\n"
`,
},
}
// Run every case as its own subtest, named after its description.
for _, run := range runs {
t.Run(run.desc, func(t *testing.T) {
doc := tester.Parse(t, run.input, "")

// MoveListItems modifies doc in place; context.TODO() is passed as a
// placeholder context.
MoveListItems(context.TODO(), doc)

tester.ExpectRepresentation(t, doc, "output", run.expected)
})
}
}
5 changes: 4 additions & 1 deletion plugin/commonmark/handle_pre_render.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@ func (c *commonmark) handlePreRender(ctx converter.Context, doc *html.Node) {
domutils.RemoveRedundant(doc, nameIsBothLink)
domutils.SwapTags(ctx, doc, nameIsBoldOrItalic, nameIsLink)

// - - - Headings - - - //
// - - - Heading - - - //
domutils.SwapTags(ctx, doc, nameIsLink, nameIsHeading)
domutils.LeafBlockAlternatives(ctx, doc)

// - - - List - - - //
domutils.MoveListItems(ctx, doc)
}
3 changes: 2 additions & 1 deletion plugin/commonmark/testdata/GoldenFiles/link.out.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ Wir freuen uns über eine [Mail](mailto:[email protected]?body=Hello%0AJohannes)!
<!--list with link-->

- [a(b)\[c\]](/page.html)
- [a\]](/page.html)

[a\]](/page.html)

<!--TODO: list with paragraph-->

Expand Down
50 changes: 50 additions & 0 deletions plugin/commonmark/testdata/GoldenFiles/list.in.html
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,56 @@
</div>


<!-- nesting -->
<ol>
<ol>
<ol>
<ol>
<ol>
<li>lots of list containers</li>
</ol>
</ol>
</ol>
</ol>
</ol>

<hr />

<ol>
<li>
<ol>
<li>
<ol>
<li>lots of list items</li>
</ol>
</li>
</ol>
</li>
</ol>


<!-- with other elements inside the list -->
<ol>
<div>A 1 (div)</div>
A 2 (#text)
<li>A 3 (li)</li>
A 4 (#text)

<ol>
<li>B 1 (li)</li>
<ol>
<li>C 1 (li)</li>
<div>C 2 (div)</div>
<div>C 3 (div)</div>
</ol>

<div>B 2 (div)</div>
<li>B 3 (li)</li>
</ol>
</ol>


<!-- with breaks -->
<ul>
<li>
<p>Start Line</p>
Expand Down
Loading

0 comments on commit d904841

Please sign in to comment.