|
@@ -0,0 +1,235 @@
|
|
|
+package zTokenizer
|
|
|
+
|
|
|
+import (
|
|
|
+ "regexp"
|
|
|
+)
|
|
|
+
|
|
|
// Enum_LineBreakMode identifies which byte sequence the tokenizer treats as a
// line break while it tracks line numbers and line positions.
type Enum_LineBreakMode uint8

const (
	// LINE_BREAK_MODE_AUTO makes Input detect the mode from the source data.
	LINE_BREAK_MODE_AUTO = Enum_LineBreakMode(iota)
	// LINE_BREAK_MODE_WIN treats the two-byte sequence "\r\n" as a line break.
	LINE_BREAK_MODE_WIN
	// LINE_BREAK_MODE_MAC treats a single "\r" as a line break.
	LINE_BREAK_MODE_MAC
	// LINE_BREAK_MODE_UNIX treats a single "\n" as a line break.
	LINE_BREAK_MODE_UNIX
	// LINE_BREAK_MODE_NONE disables line break handling entirely.
	LINE_BREAK_MODE_NONE
)
|
|
|
+
|
|
|
+type ZTokenizer struct {
|
|
|
+ tokenRegList map[int]tokenRegInst
|
|
|
+ sourceData []byte
|
|
|
+ conf ZTokenizerConf
|
|
|
+ currentPosition int
|
|
|
+ totalLength int
|
|
|
+ lineno int
|
|
|
+ linepos int
|
|
|
+ lbmode Enum_LineBreakMode
|
|
|
+ reg_lbr_r *regexp.Regexp
|
|
|
+ reg_lbr_n *regexp.Regexp
|
|
|
+ reg_lbr_rn *regexp.Regexp
|
|
|
+ lbrlist []int
|
|
|
+}
|
|
|
+
|
|
|
+type ZTokenizerConf struct {
|
|
|
+ SingleLineTokenRegexp map[int]string
|
|
|
+ MultiLineTokenRegexp map[int]string
|
|
|
+ SingleLineDecorTokenRegexp map[int]string
|
|
|
+ MultiLineDecorTokenRegexp map[int]string
|
|
|
+ LineBreakMode Enum_LineBreakMode
|
|
|
+ UseLineBreakAsToken bool
|
|
|
+ LineBreakTokenID int
|
|
|
+ LexicalErrorCallback func(char byte, lineno int, linepos int)
|
|
|
+}
|
|
|
+
|
|
|
// tokenRegInst is one registered token pattern together with its flags.
type tokenRegInst struct {
	// reg is the compiled pattern for the token.
	reg *regexp.Regexp
	// multiline reports whether the matched text may contain line breaks;
	// decor reports whether the token is consumed without being returned.
	multiline, decor bool
}
|
|
|
+
|
|
|
+func NewZTokenizer(conf ZTokenizerConf) *ZTokenizer {
|
|
|
+ zt := ZTokenizer{
|
|
|
+ tokenRegList: make(map[int]tokenRegInst),
|
|
|
+ conf: conf,
|
|
|
+ }
|
|
|
+ for k,v := range conf.SingleLineDecorTokenRegexp{
|
|
|
+ zt.tokenRegList[k] = tokenRegInst{
|
|
|
+ reg: regexp.MustCompile("^" + v),
|
|
|
+ multiline: false,
|
|
|
+ decor: true,
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for k,v := range conf.MultiLineDecorTokenRegexp{
|
|
|
+ zt.tokenRegList[k] = tokenRegInst{
|
|
|
+ reg: regexp.MustCompile("^" + v),
|
|
|
+ multiline: true,
|
|
|
+ decor: true,
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for k,v := range conf.SingleLineTokenRegexp{
|
|
|
+ zt.tokenRegList[k] = tokenRegInst{
|
|
|
+ reg: regexp.MustCompile("^" + v),
|
|
|
+ multiline: false,
|
|
|
+ decor: false,
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for k,v := range conf.MultiLineTokenRegexp{
|
|
|
+ zt.tokenRegList[k] = tokenRegInst{
|
|
|
+ reg: regexp.MustCompile("^" + v),
|
|
|
+ multiline: true,
|
|
|
+ decor: false,
|
|
|
+ }
|
|
|
+ }
|
|
|
+ zt.reg_lbr_rn = regexp.MustCompile("\r\n")
|
|
|
+ zt.reg_lbr_r = regexp.MustCompile("\r")
|
|
|
+ zt.reg_lbr_n = regexp.MustCompile("\n")
|
|
|
+ return &zt
|
|
|
+}
|
|
|
+
|
|
|
+func (this *ZTokenizer)Input(source []byte) {
|
|
|
+ this.sourceData = source
|
|
|
+ this.currentPosition = 0
|
|
|
+ this.totalLength = len(source)
|
|
|
+ this.lineno = 1
|
|
|
+ this.linepos = 1
|
|
|
+ if this.conf.LineBreakMode == LINE_BREAK_MODE_AUTO {
|
|
|
+ this.lbmode = this.checkLineBreakMode()
|
|
|
+ }else {
|
|
|
+ this.lbmode = this.conf.LineBreakMode
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func (this *ZTokenizer)GetLex() (token int, value []byte, lineno int, linepos int) {
|
|
|
+ charloop:
|
|
|
+ for this.currentPosition < this.totalLength{
|
|
|
+ flag := false
|
|
|
+ switch this.lbmode {
|
|
|
+ case LINE_BREAK_MODE_NONE:
|
|
|
+ break
|
|
|
+ case LINE_BREAK_MODE_UNIX:
|
|
|
+ if this.sourceData[this.currentPosition] == '\n'{
|
|
|
+ oln := this.lineno
|
|
|
+ olp := this.linepos
|
|
|
+ this.currentPosition++
|
|
|
+ this.lineno++
|
|
|
+ this.linepos = 1
|
|
|
+ flag = true
|
|
|
+ if this.conf.UseLineBreakAsToken {
|
|
|
+ return this.conf.LineBreakTokenID, []byte("\n"), oln, olp
|
|
|
+ }
|
|
|
+ continue charloop
|
|
|
+ }
|
|
|
+ break
|
|
|
+ case LINE_BREAK_MODE_MAC:
|
|
|
+ if this.sourceData[this.currentPosition] == '\r'{
|
|
|
+ oln := this.lineno
|
|
|
+ olp := this.linepos
|
|
|
+ this.currentPosition++
|
|
|
+ this.lineno++
|
|
|
+ this.linepos = 1
|
|
|
+ flag = true
|
|
|
+ if this.conf.UseLineBreakAsToken {
|
|
|
+ return this.conf.LineBreakTokenID, []byte("\r"), oln, olp
|
|
|
+ }
|
|
|
+ continue charloop
|
|
|
+ }
|
|
|
+ break
|
|
|
+ case LINE_BREAK_MODE_WIN:
|
|
|
+ if this.sourceData[this.currentPosition] == '\r' && this.sourceData[this.currentPosition + 1] == '\n'{
|
|
|
+ oln := this.lineno
|
|
|
+ olp := this.linepos
|
|
|
+ this.currentPosition += 2
|
|
|
+ this.lineno++
|
|
|
+ this.linepos = 1
|
|
|
+ flag = true
|
|
|
+ if this.conf.UseLineBreakAsToken {
|
|
|
+ return this.conf.LineBreakTokenID, []byte("\r\n"), oln, olp
|
|
|
+ }
|
|
|
+ continue charloop
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ regloop:
|
|
|
+ for tokentype, reginst := range this.tokenRegList {
|
|
|
+ m := reginst.reg.FindIndex(this.sourceData[this.currentPosition:])
|
|
|
+ if len(m) == 2 {
|
|
|
+ if m[1] > 0 {
|
|
|
+ st := this.currentPosition
|
|
|
+ ed := this.currentPosition + m[1]
|
|
|
+ oln := this.lineno
|
|
|
+ olp := this.linepos
|
|
|
+ this.currentPosition = ed
|
|
|
+ if reginst.multiline {
|
|
|
+ lno,lpo := this.calcMultilineTokenPosOffset(this.sourceData[st:ed])
|
|
|
+ if lno > 0 {
|
|
|
+ this.lineno += lno
|
|
|
+ this.linepos = lpo + 1
|
|
|
+ }else {
|
|
|
+ this.linepos += lpo
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ this.linepos += m[1]
|
|
|
+ }
|
|
|
+ if !reginst.decor {
|
|
|
+ return tokentype, this.sourceData[st:ed], oln, olp
|
|
|
+ }
|
|
|
+ flag = true
|
|
|
+ break regloop
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if !flag{
|
|
|
+ this.conf.LexicalErrorCallback(this.sourceData[this.currentPosition], this.lineno, this.linepos)
|
|
|
+ return 0, []byte{},this.lineno, this.linepos
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return 0, []byte{},this.lineno, this.linepos
|
|
|
+}
|
|
|
+
|
|
|
+func (this *ZTokenizer)checkLineBreakMode() Enum_LineBreakMode {
|
|
|
+ if this.reg_lbr_rn.Match(this.sourceData){
|
|
|
+ return LINE_BREAK_MODE_WIN
|
|
|
+ }
|
|
|
+ if this.reg_lbr_r.Match(this.sourceData){
|
|
|
+ return LINE_BREAK_MODE_MAC
|
|
|
+ }
|
|
|
+ if this.reg_lbr_n.Match(this.sourceData){
|
|
|
+ return LINE_BREAK_MODE_UNIX
|
|
|
+ }
|
|
|
+ return LINE_BREAK_MODE_NONE
|
|
|
+}
|
|
|
+
|
|
|
+func (this *ZTokenizer)calcMultilineTokenPosOffset(data []byte) (lnoffset,lpoffset int) {
|
|
|
+ cpi := 0
|
|
|
+ dlen := len(data)
|
|
|
+ lcnt := 0
|
|
|
+ po := 0
|
|
|
+ var reg *regexp.Regexp
|
|
|
+ switch this.lbmode {
|
|
|
+ case LINE_BREAK_MODE_NONE:
|
|
|
+ return 0, dlen
|
|
|
+ case LINE_BREAK_MODE_WIN:
|
|
|
+ reg = this.reg_lbr_rn
|
|
|
+ break
|
|
|
+ case LINE_BREAK_MODE_MAC:
|
|
|
+ reg = this.reg_lbr_r
|
|
|
+ break
|
|
|
+ case LINE_BREAK_MODE_UNIX:
|
|
|
+ reg = this.reg_lbr_n
|
|
|
+ break
|
|
|
+ }
|
|
|
+ for cpi < dlen {
|
|
|
+ m := reg.FindIndex(data[cpi:])
|
|
|
+ if len(m) == 2 {
|
|
|
+ if m[1] > 0 {
|
|
|
+ lcnt++
|
|
|
+ cpi += m[1]
|
|
|
+ po = cpi
|
|
|
+ }else {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return lcnt, dlen - po
|
|
|
+}
|