feat: get text excerpt

2022-11-07 14:27:54 +08:00 · 2022-11-07 14:27:54 +08:00 · 8f567e0abd
parent f2ac965be3
commit 8f567e0abd
4 changed files with 98 additions and 0 deletions
--- a/go.mod
+++ b/go.mod
@ -15,6 +15,7 @@ require (
 	github.com/goccy/go-json v0.9.11
 	github.com/google/uuid v1.3.0
 	github.com/google/wire v0.5.0
+	github.com/grokify/html-strip-tags-go v0.0.1
 	github.com/jinzhu/copier v0.3.5
 	github.com/jinzhu/now v1.1.5
 	github.com/lib/pq v1.10.7
--- a/go.sum
+++ b/go.sum
@ -299,6 +299,8 @@ github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51
 github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
 github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
 github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
+github.com/grokify/html-strip-tags-go v0.0.1 h1:0fThFwLbW7P/kOiTBs03FsJSV9RM2M/Q/MOnCQxKMo0=
+github.com/grokify/html-strip-tags-go v0.0.1/go.mod h1:2Su6romC5/1VXOQMaWL2yb618ARB8iVo6/DR99A6d78=
 github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de4/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
 github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
 github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY=
--- a/pkg/htmltext/htmltext.go
+++ b/pkg/htmltext/htmltext.go
@ -0,0 +1,44 @@
+package htmltext
+
+import (
+	"github.com/grokify/html-strip-tags-go"
+	"regexp"
+	"strings"
+)
+
+// Placeholders substituted for elements whose content is not useful as plain text.
+const (
+	codeRepl = "{code...}"
+	linkRepl = "[link]"
+)
+
+// The patterns and the replacer are built once at package initialisation
+// instead of on every ClearText call; *regexp.Regexp and *strings.Replacer are
+// both safe for concurrent use.
+var (
+	// codeBlockRe matches a single <pre>...</pre> block. The body is matched
+	// non-greedily so that text sitting between two separate code blocks is
+	// not swallowed into one placeholder.
+	codeBlockRe = regexp.MustCompile(`(?ism)<(pre)>.*?<\/pre>`)
+	// linkRe matches an anchor element together with its content.
+	linkRe = regexp.MustCompile(`(?ism)<a.*?[^<]>.*?<\/a>`)
+	// multiSpaceRe matches a run of one or more space characters.
+	multiSpaceRe = regexp.MustCompile(` +`)
+	// lineBreakReplacer turns newlines, carriage returns and tabs into spaces.
+	lineBreakReplacer = strings.NewReplacer(
+		"\n", " ",
+		"\r", " ",
+		"\t", " ",
+	)
+)
+
+// ClearText removes the HTML markup from html and returns the plain text.
+//
+// Every <pre>...</pre> block is replaced with "{code...}" and every
+// <a ...>...</a> element with "[link]" before the remaining tags are removed
+// by strip.StripTags. Newlines, carriage returns and tabs are then converted
+// to spaces, runs of spaces are collapsed into a single space, and leading and
+// trailing whitespace is trimmed.
+func ClearText(html string) (text string) {
+	html = codeBlockRe.ReplaceAllString(html, codeRepl)
+	html = linkRe.ReplaceAllString(html, linkRepl)
+
+	text = lineBreakReplacer.Replace(strip.StripTags(html))
+
+	// replace multiple spaces with one space
+	text = strings.TrimSpace(multiSpaceRe.ReplaceAllString(text, " "))
+	return text
+}
+
+// FetchExcerpt returns the first limit characters of the plain text extracted
+// from html by ClearText.
+//
+// The limit is counted in runes, not bytes, so multi-byte characters such as
+// CJK text or emoji are never cut in half. When the cleaned text is shorter
+// than limit the whole text is returned, and a limit of zero or less yields an
+// empty string; neither case panics.
+//
+// trimMarker is currently not applied to the result; it is kept in the
+// signature so existing callers keep compiling.
+func FetchExcerpt(html, trimMarker string, limit int) (text string) {
+	runeText := []rune(ClearText(html))
+	switch {
+	case limit <= 0:
+		return ""
+	case limit < len(runeText):
+		return string(runeText[:limit])
+	}
+	// The text already fits within the limit; slicing to limit would be out of range.
+	return string(runeText)
+}
--- a/pkg/htmltext/htmltext_test.go
+++ b/pkg/htmltext/htmltext_test.go
@ -0,0 +1,51 @@
+package htmltext
+
+import (
+	"github.com/stretchr/testify/assert"
+	"testing"
+)
+
+func TestClearText(t *testing.T) {
+	var (
+		expected,
+		clearedText string
+	)
+
+	// test code clear text
+	expected = "hello{code...}"
+	clearedText = ClearText("<p>hello<pre>var a = \"good\"</pre></p>")
+	assert.Equal(t, expected, clearedText)
+
+	// test link clear text
+	expected = "hello[link]"
+	clearedText = ClearText("<p>hello<a href=\"http://example.com/\">example.com</a></p>")
+	assert.Equal(t, expected, clearedText)
+	clearedText = ClearText("<p>hello<a href=\"https://example.com/\">example.com</a></p>")
+	assert.Equal(t, expected, clearedText)
+
+	expected = "hello world"
+	clearedText = ClearText("<div> hello</div>\n<div>world</div>")
+	assert.Equal(t, expected, clearedText)
+}
+
+func TestFetchExcerpt(t *testing.T) {
+	var (
+		expected,
+		text string
+	)
+
+	// test english string
+	expected = "hello"
+	text = FetchExcerpt("<p>hello world</p>", "...", 5)
+	assert.Equal(t, expected, text)
+
+	// test mixed string
+	expected = "hello你好"
+	text = FetchExcerpt("<p>hello你好world</p>", "...", 7)
+	assert.Equal(t, expected, text)
+
+	// test mixed string with emoticon
+	expected = "hello你好😂"
+	text = FetchExcerpt("<p>hello你好😂world</p>", "...", 8)
+	assert.Equal(t, expected, text)
+}