package zTokenizer import ( "regexp" "sort" ) type Enum_LineBreakMode uint8 const ( LINE_BREAK_MODE_AUTO Enum_LineBreakMode = iota LINE_BREAK_MODE_WIN LINE_BREAK_MODE_MAC LINE_BREAK_MODE_UNIX LINE_BREAK_MODE_NONE ) type ZTokenizer struct { tokenlist []int tokenRegList map[int]tokenRegInst sourceData []byte conf ZTokenizerConf currentPosition int totalLength int lineno int linepos int lbmode Enum_LineBreakMode reg_lbr_r *regexp.Regexp reg_lbr_n *regexp.Regexp reg_lbr_rn *regexp.Regexp lbrlist []int } type ZTokenizerConf struct { SingleLineTokenRegexp map[int]string MultiLineTokenRegexp map[int]string SingleLineDecorTokenRegexp map[int]string MultiLineDecorTokenRegexp map[int]string LineBreakMode Enum_LineBreakMode UseLineBreakAsToken bool LineBreakTokenID int LexicalErrorCallback func(char byte, lineno int, linepos int) } type tokenRegInst struct { reg *regexp.Regexp multiline bool decor bool } func NewZTokenizer(conf ZTokenizerConf) *ZTokenizer { zt := ZTokenizer{ tokenRegList: make(map[int]tokenRegInst), conf: conf, } for k,v := range conf.SingleLineDecorTokenRegexp{ zt.tokenRegList[k] = tokenRegInst{ reg: regexp.MustCompile("^" + v), multiline: false, decor: true, } } for k,v := range conf.MultiLineDecorTokenRegexp{ zt.tokenRegList[k] = tokenRegInst{ reg: regexp.MustCompile("^" + v), multiline: true, decor: true, } } for k,v := range conf.SingleLineTokenRegexp{ zt.tokenRegList[k] = tokenRegInst{ reg: regexp.MustCompile("^" + v), multiline: false, decor: false, } } for k,v := range conf.MultiLineTokenRegexp{ zt.tokenRegList[k] = tokenRegInst{ reg: regexp.MustCompile("^" + v), multiline: true, decor: false, } } zt.tokenlist = make([]int, len(zt.tokenRegList)) i := 0 for k := range zt.tokenRegList { zt.tokenlist[i] = k i++ } sort.Ints(zt.tokenlist) zt.reg_lbr_rn = regexp.MustCompile("\r\n") zt.reg_lbr_r = regexp.MustCompile("\r") zt.reg_lbr_n = regexp.MustCompile("\n") return &zt } func (this *ZTokenizer)Input(source []byte) { this.sourceData = source this.currentPosition = 0 this.totalLength = len(source) this.lineno = 1 this.linepos = 1 if this.conf.LineBreakMode == LINE_BREAK_MODE_AUTO { this.lbmode = this.checkLineBreakMode() }else { this.lbmode = this.conf.LineBreakMode } } func (this *ZTokenizer)GetLex() (token int, value []byte, lineno int, linepos int) { charloop: for this.currentPosition < this.totalLength{ flag := false switch this.lbmode { case LINE_BREAK_MODE_NONE: break case LINE_BREAK_MODE_UNIX: if this.sourceData[this.currentPosition] == '\n'{ oln := this.lineno olp := this.linepos this.currentPosition++ this.lineno++ this.linepos = 1 flag = true if this.conf.UseLineBreakAsToken { return this.conf.LineBreakTokenID, []byte("\n"), oln, olp } continue charloop } break case LINE_BREAK_MODE_MAC: if this.sourceData[this.currentPosition] == '\r'{ oln := this.lineno olp := this.linepos this.currentPosition++ this.lineno++ this.linepos = 1 flag = true if this.conf.UseLineBreakAsToken { return this.conf.LineBreakTokenID, []byte("\r"), oln, olp } continue charloop } break case LINE_BREAK_MODE_WIN: if this.sourceData[this.currentPosition] == '\r' && this.sourceData[this.currentPosition + 1] == '\n'{ oln := this.lineno olp := this.linepos this.currentPosition += 2 this.lineno++ this.linepos = 1 flag = true if this.conf.UseLineBreakAsToken { return this.conf.LineBreakTokenID, []byte("\r\n"), oln, olp } continue charloop } break } regloop: for _,tokentype := range this.tokenlist { reginst := this.tokenRegList[tokentype] m := reginst.reg.FindIndex(this.sourceData[this.currentPosition:]) if len(m) == 2 { if m[1] > 0 { st := this.currentPosition ed := this.currentPosition + m[1] oln := this.lineno olp := this.linepos this.currentPosition = ed if reginst.multiline { lno,lpo := this.calcMultilineTokenPosOffset(this.sourceData[st:ed]) if lno > 0 { this.lineno += lno this.linepos = lpo + 1 }else { this.linepos += lpo } }else { this.linepos += m[1] } if !reginst.decor { return tokentype, this.sourceData[st:ed], oln, olp } flag = true break regloop } } } if !flag{ this.conf.LexicalErrorCallback(this.sourceData[this.currentPosition], this.lineno, this.linepos) return 0, []byte{},this.lineno, this.linepos } } return 0, []byte{},this.lineno, this.linepos } func (this *ZTokenizer)checkLineBreakMode() Enum_LineBreakMode { if this.reg_lbr_rn.Match(this.sourceData){ return LINE_BREAK_MODE_WIN } if this.reg_lbr_r.Match(this.sourceData){ return LINE_BREAK_MODE_MAC } if this.reg_lbr_n.Match(this.sourceData){ return LINE_BREAK_MODE_UNIX } return LINE_BREAK_MODE_NONE } func (this *ZTokenizer)calcMultilineTokenPosOffset(data []byte) (lnoffset,lpoffset int) { cpi := 0 dlen := len(data) lcnt := 0 po := 0 var reg *regexp.Regexp switch this.lbmode { case LINE_BREAK_MODE_NONE: return 0, dlen case LINE_BREAK_MODE_WIN: reg = this.reg_lbr_rn break case LINE_BREAK_MODE_MAC: reg = this.reg_lbr_r break case LINE_BREAK_MODE_UNIX: reg = this.reg_lbr_n break } for cpi < dlen { m := reg.FindIndex(data[cpi:]) if len(m) == 2 { if m[1] > 0 { lcnt++ cpi += m[1] po = cpi }else { break } }else { break } } return lcnt, dlen - po }