feat: add html text cut pkg

This commit is contained in:
LinkinStar 2022-11-09 17:20:16 +08:00
parent b766824a4e
commit 3d95d03911
4 changed files with 117 additions and 3 deletions

3
go.mod
View File

@ -15,6 +15,7 @@ require (
github.com/goccy/go-json v0.9.11
github.com/google/uuid v1.3.0
github.com/google/wire v0.5.0
github.com/grokify/html-strip-tags-go v0.0.1
github.com/jinzhu/copier v0.3.5
github.com/jinzhu/now v1.1.5
github.com/lib/pq v1.10.7
@ -35,6 +36,7 @@ require (
golang.org/x/crypto v0.1.0
golang.org/x/net v0.1.0
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
gopkg.in/yaml.v3 v3.0.1
xorm.io/builder v0.3.12
xorm.io/core v0.7.3
xorm.io/xorm v1.3.2
@ -110,6 +112,5 @@ require (
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)

4
go.sum
View File

@ -299,6 +299,8 @@ github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51
github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
github.com/grokify/html-strip-tags-go v0.0.1 h1:0fThFwLbW7P/kOiTBs03FsJSV9RM2M/Q/MOnCQxKMo0=
github.com/grokify/html-strip-tags-go v0.0.1/go.mod h1:2Su6romC5/1VXOQMaWL2yb618ARB8iVo6/DR99A6d78=
github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de4/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=
@ -594,8 +596,6 @@ github.com/segmentfault/pacman/contrib/cache/memory v0.0.0-20221018072427-a15dd1
github.com/segmentfault/pacman/contrib/cache/memory v0.0.0-20221018072427-a15dd1434e05/go.mod h1:rmf1TCwz67dyM+AmTwSd1BxTo2AOYHj262lP93bOZbs=
github.com/segmentfault/pacman/contrib/conf/viper v0.0.0-20221018072427-a15dd1434e05 h1:BlqTgc3/MYKG6vMI2MI+6o+7P4Gy5PXlawu185wPXAk=
github.com/segmentfault/pacman/contrib/conf/viper v0.0.0-20221018072427-a15dd1434e05/go.mod h1:prPjFam7MyZ5b3S9dcDOt2tMPz6kf7C9c243s9zSwPY=
github.com/segmentfault/pacman/contrib/i18n v0.0.0-20221018072427-a15dd1434e05 h1:gFCY9KUxhYg+/MXNcDYl4ILK+R1SG78FtaSR3JqZNYY=
github.com/segmentfault/pacman/contrib/i18n v0.0.0-20221018072427-a15dd1434e05/go.mod h1:5Afm+OQdau/HQqSOp/ALlSUp0vZsMMMbv//kJhxuoi8=
github.com/segmentfault/pacman/contrib/i18n v0.0.0-20221109042453-26158da67632 h1:so07u8RWXZQ0gz30KXJ9MKtQ5zjgcDlQ/UwFZrwm5b0=
github.com/segmentfault/pacman/contrib/i18n v0.0.0-20221109042453-26158da67632/go.mod h1:5Afm+OQdau/HQqSOp/ALlSUp0vZsMMMbv//kJhxuoi8=
github.com/segmentfault/pacman/contrib/log/zap v0.0.0-20221018072427-a15dd1434e05 h1:jcGZU2juv0L3eFEkuZYV14ESLUlWfGMWnP0mjOfrSZc=

61
pkg/htmltext/htmltext.go Normal file
View File

@ -0,0 +1,61 @@
package htmltext
import (
"regexp"
"strings"
"github.com/grokify/html-strip-tags-go"
)
// ClearText clear HTML, get the clear text
func ClearText(html string) (text string) {
if len(html) == 0 {
text = html
return
}
var (
re *regexp.Regexp
codeReg = `(?ism)<(pre)>.*<\/pre>`
codeRepl = "{code...}"
linkReg = `(?ism)<a.*?[^<]>.*?<\/a>`
linkRepl = "[link]"
spaceReg = ` +`
spaceRepl = " "
)
re = regexp.MustCompile(codeReg)
html = re.ReplaceAllString(html, codeRepl)
re = regexp.MustCompile(linkReg)
html = re.ReplaceAllString(html, linkRepl)
text = strings.NewReplacer(
"\n", " ",
"\r", " ",
"\t", " ",
).Replace(strip.StripTags(html))
// replace multiple spaces to one space
re = regexp.MustCompile(spaceReg)
text = strings.TrimSpace(re.ReplaceAllString(text, spaceRepl))
return
}
// FetchExcerpt return the excerpt from the HTML string
func FetchExcerpt(html, trimMarker string, limit int) (text string) {
if len(html) == 0 {
text = html
return
}
text = ClearText(html)
runeText := []rune(text)
if len(runeText) <= limit {
text = string(runeText)
} else {
text = string(runeText[0:limit])
}
text += trimMarker
return
}

View File

@ -0,0 +1,52 @@
package htmltext
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestClearText(t *testing.T) {
var (
expected,
clearedText string
)
// test code clear text
expected = "hello{code...}"
clearedText = ClearText("<p>hello<pre>var a = \"good\"</pre></p>")
assert.Equal(t, expected, clearedText)
// test link clear text
expected = "hello[link]"
clearedText = ClearText("<p>hello<a href=\"http://example.com/\">example.com</a></p>")
assert.Equal(t, expected, clearedText)
clearedText = ClearText("<p>hello<a href=\"https://example.com/\">example.com</a></p>")
assert.Equal(t, expected, clearedText)
expected = "hello world"
clearedText = ClearText("<div> hello</div>\n<div>world</div>")
assert.Equal(t, expected, clearedText)
}
func TestFetchExcerpt(t *testing.T) {
var (
expected,
text string
)
// test english string
expected = "hello..."
text = FetchExcerpt("<p>hello world</p>", "...", 5)
assert.Equal(t, expected, text)
// test mixed string
expected = "hello你好..."
text = FetchExcerpt("<p>hello你好world</p>", "...", 7)
assert.Equal(t, expected, text)
// test mixed string with emoticon
expected = "hello你好😂..."
text = FetchExcerpt("<p>hello你好😂world</p>", "...", 8)
assert.Equal(t, expected, text)
}