Merge pull request #162 from kongfei605/cn_decoding

Support decoding logs written in Chinese character encodings (GB18030, GBK/GB2312, HZ-GB2312, Big5)
This commit is contained in:
kongfei605 2022-08-19 19:29:47 +08:00 committed by GitHub
commit e5e8917bcc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 1 deletions

View File

@ -25,6 +25,16 @@ const (
UTF16BE string = "utf-16-be" UTF16BE string = "utf-16-be"
// UTF16LE for UTF-16 Little Endian encoding // UTF16LE for UTF-16 Little Endian encoding
UTF16LE string = "utf-16-le" UTF16LE string = "utf-16-le"
// Chinese character encodings. References:
// https://en.wikipedia.org/wiki/GB_2312
// https://en.wikipedia.org/wiki/GBK_(character_encoding)
// https://en.wikipedia.org/wiki/GB_18030
// https://en.wikipedia.org/wiki/Big5
GB18030 string = "gb18030"
GB2312 string = "gb2312"
HZGB2312 string = "hz-gb2312"
GBK string = "gbk"
BIG5 string = "big5"
) )
// LogsConfig represents a log source config, which can be for instance // LogsConfig represents a log source config, which can be for instance

View File

@ -14,6 +14,7 @@ import (
"path/filepath" "path/filepath"
"regexp" "regexp"
"strconv" "strconv"
"strings"
"sync/atomic" "sync/atomic"
"time" "time"
@ -77,13 +78,26 @@ func NewDecoderFromSourceWithPattern(source *logsconfig.LogSource, multiLinePatt
// lineParser = docker.JSONParser // lineParser = docker.JSONParser
// matcher = &decoder.NewLineMatcher{} // matcher = &decoder.NewLineMatcher{}
default: default:
switch source.Config.Encoding { switch strings.ToLower(source.Config.Encoding) {
case logsconfig.UTF16BE: case logsconfig.UTF16BE:
lineParser = parser.NewDecodingParser(parser.UTF16BE) lineParser = parser.NewDecodingParser(parser.UTF16BE)
matcher = decoder.NewBytesSequenceMatcher(decoder.Utf16beEOL) matcher = decoder.NewBytesSequenceMatcher(decoder.Utf16beEOL)
case logsconfig.UTF16LE: case logsconfig.UTF16LE:
lineParser = parser.NewDecodingParser(parser.UTF16LE) lineParser = parser.NewDecodingParser(parser.UTF16LE)
matcher = decoder.NewBytesSequenceMatcher(decoder.Utf16leEOL) matcher = decoder.NewBytesSequenceMatcher(decoder.Utf16leEOL)
case logsconfig.GB18030:
lineParser = parser.NewDecodingParser(parser.GBK18030)
matcher = &decoder.NewLineMatcher{}
case logsconfig.HZGB2312:
lineParser = parser.NewDecodingParser(parser.HZGB2312)
matcher = &decoder.NewLineMatcher{}
case logsconfig.GBK, logsconfig.GB2312:
lineParser = parser.NewDecodingParser(parser.GBK)
matcher = &decoder.NewLineMatcher{}
case logsconfig.BIG5:
lineParser = parser.NewDecodingParser(parser.BIG5)
matcher = &decoder.NewLineMatcher{}
default: default:
lineParser = parser.NoopParser lineParser = parser.NoopParser
matcher = &decoder.NewLineMatcher{} matcher = &decoder.NewLineMatcher{}

View File

@ -7,6 +7,8 @@ package parser
import ( import (
"golang.org/x/text/encoding" "golang.org/x/text/encoding"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode" "golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform" "golang.org/x/text/transform"
) )
@ -22,6 +24,16 @@ const (
UTF16LE = iota UTF16LE = iota
// UTF16BE UTF16 big endian // UTF16BE UTF16 big endian
UTF16BE UTF16BE
// GBK18030 GB 18030 Chinese encoding (decoded with simplifiedchinese.GB18030)
GBK18030
// GB2312 GB 2312 simplified Chinese encoding (decoded with the GBK decoder)
GB2312
// HZGB2312 HZ-GB2312 simplified Chinese encoding
HZGB2312
// GBK GBK simplified Chinese encoding
GBK
// BIG5 Big5 traditional Chinese encoding
BIG5
) )
// Parser parse messages // Parser parse messages
@ -67,6 +79,14 @@ func NewDecodingParser(e Encoding) *DecodingParser {
enc = unicode.UTF16(unicode.LittleEndian, unicode.UseBOM) enc = unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)
case UTF16BE: case UTF16BE:
enc = unicode.UTF16(unicode.BigEndian, unicode.UseBOM) enc = unicode.UTF16(unicode.BigEndian, unicode.UseBOM)
case GBK, GB2312:
enc = simplifiedchinese.GBK
case HZGB2312:
enc = simplifiedchinese.HZGB2312
case GBK18030:
enc = simplifiedchinese.GB18030
case BIG5:
enc = traditionalchinese.Big5
} }
p.decoder = enc.NewDecoder() p.decoder = enc.NewDecoder()
return p return p