feat: get text excerpt

This commit is contained in:
kumfo 2022-11-07 14:27:54 +08:00
parent f2ac965be3
commit 8f567e0abd
4 changed files with 98 additions and 0 deletions

1
go.mod
View File

@ -15,6 +15,7 @@ require (
github.com/goccy/go-json v0.9.11
github.com/google/uuid v1.3.0
github.com/google/wire v0.5.0
github.com/grokify/html-strip-tags-go v0.0.1
github.com/jinzhu/copier v0.3.5
github.com/jinzhu/now v1.1.5
github.com/lib/pq v1.10.7

2
go.sum
View File

@ -299,6 +299,8 @@ github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51
github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
github.com/grokify/html-strip-tags-go v0.0.1 h1:0fThFwLbW7P/kOiTBs03FsJSV9RM2M/Q/MOnCQxKMo0=
github.com/grokify/html-strip-tags-go v0.0.1/go.mod h1:2Su6romC5/1VXOQMaWL2yb618ARB8iVo6/DR99A6d78=
github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de4/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=

44
pkg/htmltext/htmltext.go Normal file
View File

@ -0,0 +1,44 @@
package htmltext
import (
"github.com/grokify/html-strip-tags-go"
"regexp"
"strings"
)
// Patterns and replacers used by ClearText. They are built once at package
// initialisation rather than on every call; both *regexp.Regexp and
// *strings.Replacer are safe for concurrent use.
var (
	// codeBlockRe matches a whole <pre>...</pre> element. The body is matched
	// lazily so that every element is replaced on its own; a greedy match
	// would run from the first <pre> to the last </pre> and swallow any text
	// that sits between two code blocks.
	codeBlockRe = regexp.MustCompile(`(?ism)<(pre)>.*?<\/pre>`)

	// linkRe matches an anchor element together with its content.
	// NOTE(review): the opening part also matches other tags whose name
	// starts with "a" (e.g. <abbr>); confirm whether that is intended.
	linkRe = regexp.MustCompile(`(?ism)<a.*?[^<]>.*?<\/a>`)

	// multiSpaceRe matches a run of one or more space characters.
	multiSpaceRe = regexp.MustCompile(` +`)

	// whitespaceFlattener turns line breaks and tabs into plain spaces.
	whitespaceFlattener = strings.NewReplacer(
		"\n", " ",
		"\r", " ",
		"\t", " ",
	)
)

// ClearText converts an HTML fragment into a single line of plain text.
//
// Every <pre> element is replaced by the placeholder "{code...}" and every
// anchor element by "[link]". The remaining tags are removed with
// strip.StripTags, newlines, carriage returns and tabs become spaces, runs of
// spaces are collapsed into one, and leading/trailing whitespace is trimmed.
func ClearText(html string) (text string) {
	// Placeholders are substituted before the tags are stripped, otherwise
	// the element boundaries would already be gone.
	html = codeBlockRe.ReplaceAllString(html, "{code...}")
	html = linkRe.ReplaceAllString(html, "[link]")

	text = whitespaceFlattener.Replace(strip.StripTags(html))

	// Collapse multiple spaces into one.
	text = multiSpaceRe.ReplaceAllString(text, " ")
	return strings.TrimSpace(text)
}
// FetchExcerpt returns an excerpt of the plain text contained in html.
//
// The HTML is first reduced to plain text with ClearText; the result is then
// cut to at most limit characters. Characters are counted as runes, not
// bytes, so multi-byte characters are never split. When the text is not
// longer than limit it is returned unchanged, and a limit of zero or less
// yields an empty string.
//
// trimMarker is accepted but is not applied to the result.
func FetchExcerpt(html, trimMarker string, limit int) (text string) {
	text = ClearText(html)
	if limit <= 0 {
		return ""
	}

	runes := []rune(text)
	// Slicing past the end of the rune slice would panic, so short texts
	// are returned unchanged.
	if len(runes) <= limit {
		return text
	}
	return string(runes[:limit])
}

View File

@ -0,0 +1,51 @@
package htmltext
import (
"github.com/stretchr/testify/assert"
"testing"
)
func TestClearText(t *testing.T) {
var (
expected,
clearedText string
)
// test code clear text
expected = "hello{code...}"
clearedText = ClearText("<p>hello<pre>var a = \"good\"</pre></p>")
assert.Equal(t, expected, clearedText)
// test link clear text
expected = "hello[link]"
clearedText = ClearText("<p>hello<a href=\"http://example.com/\">example.com</a></p>")
assert.Equal(t, expected, clearedText)
clearedText = ClearText("<p>hello<a href=\"https://example.com/\">example.com</a></p>")
assert.Equal(t, expected, clearedText)
expected = "hello world"
clearedText = ClearText("<div> hello</div>\n<div>world</div>")
assert.Equal(t, expected, clearedText)
}
// TestFetchExcerpt checks that FetchExcerpt cuts the cleaned text by
// character count for ASCII, CJK and emoji content.
func TestFetchExcerpt(t *testing.T) {
	cases := []struct {
		html  string
		limit int
		want  string
	}{
		// english string
		{"<p>hello world</p>", 5, "hello"},
		// mixed string
		{"<p>hello你好world</p>", 7, "hello你好"},
		// mixed string with emoticon
		{"<p>hello你好😂world</p>", 8, "hello你好😂"},
	}

	for _, c := range cases {
		got := FetchExcerpt(c.html, "...", c.limit)
		assert.Equal(t, c.want, got)
	}
}