Skip to content

Commit

Permalink
fix: allow # in link URLs and prevent unnecessary space, fixes #11
Browse files: browse the repository at this point in the history
  • Loading branch information
Mario Hros committed Nov 1, 2020
1 parent 62431c4 commit 52104c9
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
9 changes: 5 additions & 4 deletions html2text.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ const (
var lbr = WIN_LBR
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s*)`)
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
-var badLinkHrefRE = regexp.MustCompile(`#|javascript:`)
+var badLinkHrefRE = regexp.MustCompile(`javascript:`)
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
var numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)

Expand Down Expand Up @@ -124,12 +124,14 @@ func HTML2Text(html string) string {
// skip new lines and spaces adding a single space if not there yet
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
-writeSpace(outBuf)
+if shouldOutput && badTagStackDepth == 0 && !inEnt {
+//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
+writeSpace(outBuf)
+}
continue

case r == ';' && inEnt: // end of html entity
inEnt = false
-shouldOutput = true
continue

case r == '&' && shouldOutput: // possible html entity
Expand All @@ -156,7 +158,6 @@ func HTML2Text(html string) string {
if ent, isEnt := parseHTMLEntity(entName); isEnt {
outBuf.WriteString(ent)
inEnt = true
-shouldOutput = false
continue
}
}
Expand Down
2 changes: 2 additions & 0 deletions html2text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ func TestHTML2Text(t *testing.T) {
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click test")
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click ents/'x'")
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click ")
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click test")
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click http://bit.ly/2n4wXRs")
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "/wiki/yet#English, /wiki/not_yet#English")
})

Convey("Inlines", func() {
Expand Down

0 comments on commit 52104c9

Please sign in to comment.