Merge pull request #162 from kongfei605/cn_decoding
Support decoding logs that contain Chinese characters
This commit is contained in:
commit
e5e8917bcc
|
@ -25,6 +25,16 @@ const (
|
||||||
UTF16BE string = "utf-16-be"
|
UTF16BE string = "utf-16-be"
|
||||||
// UTF16LE for UTF-16 Little Endian encoding
|
// UTF16LE for UTF-16 Little Endian encoding
|
||||||
UTF16LE string = "utf-16-le"
|
UTF16LE string = "utf-16-le"
|
||||||
|
|
||||||
|
// https://en.wikipedia.org/wiki/GB_2312
|
||||||
|
// https://en.wikipedia.org/wiki/GBK_(character_encoding)
|
||||||
|
// https://en.wikipedia.org/wiki/GB_18030
|
||||||
|
// https://en.wikipedia.org/wiki/Big5
|
||||||
|
GB18030 string = "gb18030"
|
||||||
|
GB2312 string = "gb2312"
|
||||||
|
HZGB2312 string = "hz-gb2312"
|
||||||
|
GBK string = "gbk"
|
||||||
|
BIG5 string = "big5"
|
||||||
)
|
)
|
||||||
|
|
||||||
// LogsConfig represents a log source config, which can be for instance
|
// LogsConfig represents a log source config, which can be for instance
|
||||||
|
|
|
@ -14,6 +14,7 @@ import (
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
@ -77,13 +78,26 @@ func NewDecoderFromSourceWithPattern(source *logsconfig.LogSource, multiLinePatt
|
||||||
// lineParser = docker.JSONParser
|
// lineParser = docker.JSONParser
|
||||||
// matcher = &decoder.NewLineMatcher{}
|
// matcher = &decoder.NewLineMatcher{}
|
||||||
default:
|
default:
|
||||||
switch source.Config.Encoding {
|
switch strings.ToLower(source.Config.Encoding) {
|
||||||
case logsconfig.UTF16BE:
|
case logsconfig.UTF16BE:
|
||||||
lineParser = parser.NewDecodingParser(parser.UTF16BE)
|
lineParser = parser.NewDecodingParser(parser.UTF16BE)
|
||||||
matcher = decoder.NewBytesSequenceMatcher(decoder.Utf16beEOL)
|
matcher = decoder.NewBytesSequenceMatcher(decoder.Utf16beEOL)
|
||||||
case logsconfig.UTF16LE:
|
case logsconfig.UTF16LE:
|
||||||
lineParser = parser.NewDecodingParser(parser.UTF16LE)
|
lineParser = parser.NewDecodingParser(parser.UTF16LE)
|
||||||
matcher = decoder.NewBytesSequenceMatcher(decoder.Utf16leEOL)
|
matcher = decoder.NewBytesSequenceMatcher(decoder.Utf16leEOL)
|
||||||
|
case logsconfig.GB18030:
|
||||||
|
lineParser = parser.NewDecodingParser(parser.GBK18030)
|
||||||
|
matcher = &decoder.NewLineMatcher{}
|
||||||
|
case logsconfig.HZGB2312:
|
||||||
|
lineParser = parser.NewDecodingParser(parser.HZGB2312)
|
||||||
|
matcher = &decoder.NewLineMatcher{}
|
||||||
|
case logsconfig.GBK, logsconfig.GB2312:
|
||||||
|
lineParser = parser.NewDecodingParser(parser.GBK)
|
||||||
|
matcher = &decoder.NewLineMatcher{}
|
||||||
|
case logsconfig.BIG5:
|
||||||
|
lineParser = parser.NewDecodingParser(parser.BIG5)
|
||||||
|
matcher = &decoder.NewLineMatcher{}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
lineParser = parser.NoopParser
|
lineParser = parser.NoopParser
|
||||||
matcher = &decoder.NewLineMatcher{}
|
matcher = &decoder.NewLineMatcher{}
|
||||||
|
|
|
@ -7,6 +7,8 @@ package parser
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"golang.org/x/text/encoding"
|
"golang.org/x/text/encoding"
|
||||||
|
"golang.org/x/text/encoding/simplifiedchinese"
|
||||||
|
"golang.org/x/text/encoding/traditionalchinese"
|
||||||
"golang.org/x/text/encoding/unicode"
|
"golang.org/x/text/encoding/unicode"
|
||||||
"golang.org/x/text/transform"
|
"golang.org/x/text/transform"
|
||||||
)
|
)
|
||||||
|
@ -22,6 +24,16 @@ const (
|
||||||
UTF16LE = iota
|
UTF16LE = iota
|
||||||
// UTF16BE UTF16 big endian
|
// UTF16BE UTF16 big endian
|
||||||
UTF16BE
|
UTF16BE
|
||||||
|
// GBK18030 GB 18030 (simplified Chinese)
|
||||||
|
GBK18030
|
||||||
|
// GB2312 GB 2312 (simplified Chinese), decoded with the GBK decoder
|
||||||
|
GB2312
|
||||||
|
// HZGB2312 HZ-GB2312 (simplified Chinese)
|
||||||
|
HZGB2312
|
||||||
|
// GBK GBK (simplified Chinese)
|
||||||
|
GBK
|
||||||
|
// BIG5 Big5 (traditional Chinese)
|
||||||
|
BIG5
|
||||||
)
|
)
|
||||||
|
|
||||||
// Parser parse messages
|
// Parser parse messages
|
||||||
|
@ -67,6 +79,14 @@ func NewDecodingParser(e Encoding) *DecodingParser {
|
||||||
enc = unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)
|
enc = unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)
|
||||||
case UTF16BE:
|
case UTF16BE:
|
||||||
enc = unicode.UTF16(unicode.BigEndian, unicode.UseBOM)
|
enc = unicode.UTF16(unicode.BigEndian, unicode.UseBOM)
|
||||||
|
case GBK, GB2312:
|
||||||
|
enc = simplifiedchinese.GBK
|
||||||
|
case HZGB2312:
|
||||||
|
enc = simplifiedchinese.HZGB2312
|
||||||
|
case GBK18030:
|
||||||
|
enc = simplifiedchinese.GB18030
|
||||||
|
case BIG5:
|
||||||
|
enc = traditionalchinese.Big5
|
||||||
}
|
}
|
||||||
p.decoder = enc.NewDecoder()
|
p.decoder = enc.NewDecoder()
|
||||||
return p
|
return p
|
||||||
|
|
Loading…
Reference in New Issue