6 years ago · 1380b030d7
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -0,0 +1,235 @@
 
				+package zTokenizer
			
 
				+
			
 
				+import (
			
 
				+	"regexp"
			
 
				+)
			
 
				+
			
 
				// Enum_LineBreakMode selects which byte sequence the tokenizer treats as a
				// line break while it tracks line numbers.
				type Enum_LineBreakMode uint8

				// Line break modes. The numeric values are spelled out because they are
				// part of the package's visible constant set.
				const (
					// LINE_BREAK_MODE_AUTO makes Input choose the mode by inspecting the source.
					LINE_BREAK_MODE_AUTO Enum_LineBreakMode = 0
					// LINE_BREAK_MODE_WIN treats "\r\n" as the line break.
					LINE_BREAK_MODE_WIN Enum_LineBreakMode = 1
					// LINE_BREAK_MODE_MAC treats "\r" as the line break.
					LINE_BREAK_MODE_MAC Enum_LineBreakMode = 2
					// LINE_BREAK_MODE_UNIX treats "\n" as the line break.
					LINE_BREAK_MODE_UNIX Enum_LineBreakMode = 3
					// LINE_BREAK_MODE_NONE disables line break recognition entirely.
					LINE_BREAK_MODE_NONE Enum_LineBreakMode = 4
				)
			
 
				+
			
 
				+type ZTokenizer struct {
			
 
				+	tokenRegList    map[int]tokenRegInst
			
 
				+	sourceData      []byte
			
 
				+	conf            ZTokenizerConf
			
 
				+	currentPosition int
			
 
				+	totalLength     int
			
 
				+	lineno			int
			
 
				+	linepos			int
			
 
				+	lbmode			Enum_LineBreakMode
			
 
				+	reg_lbr_r		*regexp.Regexp
			
 
				+	reg_lbr_n		*regexp.Regexp
			
 
				+	reg_lbr_rn		*regexp.Regexp
			
 
				+	lbrlist			[]int
			
 
				+}
			
 
				+
			
 
				+type ZTokenizerConf struct {
			
 
				+	SingleLineTokenRegexp map[int]string
			
 
				+	MultiLineTokenRegexp map[int]string
			
 
				+	SingleLineDecorTokenRegexp map[int]string
			
 
				+	MultiLineDecorTokenRegexp map[int]string
			
 
				+	LineBreakMode Enum_LineBreakMode
			
 
				+	UseLineBreakAsToken bool
			
 
				+	LineBreakTokenID int
			
 
				+	LexicalErrorCallback func(char byte, lineno int, linepos int)
			
 
				+}
			
 
				+
			
 
				// tokenRegInst pairs a compiled token pattern with the flags that tell
				// GetLex how to treat a match.
				type tokenRegInst struct {
					// reg is the compiled pattern.
					reg *regexp.Regexp
					// multiline marks patterns whose matches are scanned for line breaks;
					// decor marks patterns whose matches are consumed but not returned.
					multiline, decor bool
				}
			
 
				+
			
 
				+func NewZTokenizer(conf ZTokenizerConf) *ZTokenizer {
			
 
				+	zt := ZTokenizer{
			
 
				+		tokenRegList: make(map[int]tokenRegInst),
			
 
				+		conf:         conf,
			
 
				+	}
			
 
				+	for k,v := range conf.SingleLineDecorTokenRegexp{
			
 
				+		zt.tokenRegList[k] = tokenRegInst{
			
 
				+			reg: regexp.MustCompile("^" + v),
			
 
				+			multiline: false,
			
 
				+			decor: true,
			
 
				+		}
			
 
				+	}
			
 
				+	for k,v := range conf.MultiLineDecorTokenRegexp{
			
 
				+		zt.tokenRegList[k] = tokenRegInst{
			
 
				+			reg: regexp.MustCompile("^" + v),
			
 
				+			multiline: true,
			
 
				+			decor: true,
			
 
				+		}
			
 
				+	}
			
 
				+	for k,v := range conf.SingleLineTokenRegexp{
			
 
				+		zt.tokenRegList[k] = tokenRegInst{
			
 
				+			reg: regexp.MustCompile("^" + v),
			
 
				+			multiline: false,
			
 
				+			decor: false,
			
 
				+		}
			
 
				+	}
			
 
				+	for k,v := range conf.MultiLineTokenRegexp{
			
 
				+		zt.tokenRegList[k] = tokenRegInst{
			
 
				+			reg: regexp.MustCompile("^" + v),
			
 
				+			multiline: true,
			
 
				+			decor: false,
			
 
				+		}
			
 
				+	}
			
 
				+	zt.reg_lbr_rn = regexp.MustCompile("\r\n")
			
 
				+	zt.reg_lbr_r = regexp.MustCompile("\r")
			
 
				+	zt.reg_lbr_n = regexp.MustCompile("\n")
			
 
				+	return &zt
			
 
				+}
			
 
				+
			
 
				+func (this *ZTokenizer)Input(source []byte) {
			
 
				+	this.sourceData = source
			
 
				+	this.currentPosition = 0
			
 
				+	this.totalLength = len(source)
			
 
				+	this.lineno = 1
			
 
				+	this.linepos = 1
			
 
				+	if this.conf.LineBreakMode == LINE_BREAK_MODE_AUTO {
			
 
				+		this.lbmode = this.checkLineBreakMode()
			
 
				+	}else {
			
 
				+		this.lbmode = this.conf.LineBreakMode
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func (this *ZTokenizer)GetLex() (token int, value []byte, lineno int, linepos int) {
			
 
				+	charloop:
			
 
				+	for this.currentPosition < this.totalLength{
			
 
				+		flag := false
			
 
				+		switch this.lbmode {
			
 
				+		case LINE_BREAK_MODE_NONE:
			
 
				+			break
			
 
				+		case LINE_BREAK_MODE_UNIX:
			
 
				+			if this.sourceData[this.currentPosition] == '\n'{
			
 
				+				oln := this.lineno
			
 
				+				olp := this.linepos
			
 
				+				this.currentPosition++
			
 
				+				this.lineno++
			
 
				+				this.linepos = 1
			
 
				+				flag = true
			
 
				+				if this.conf.UseLineBreakAsToken {
			
 
				+					return this.conf.LineBreakTokenID, []byte("\n"), oln, olp
			
 
				+				}
			
 
				+				continue charloop
			
 
				+			}
			
 
				+			break
			
 
				+		case LINE_BREAK_MODE_MAC:
			
 
				+			if this.sourceData[this.currentPosition] == '\r'{
			
 
				+				oln := this.lineno
			
 
				+				olp := this.linepos
			
 
				+				this.currentPosition++
			
 
				+				this.lineno++
			
 
				+				this.linepos = 1
			
 
				+				flag = true
			
 
				+				if this.conf.UseLineBreakAsToken {
			
 
				+					return this.conf.LineBreakTokenID, []byte("\r"), oln, olp
			
 
				+				}
			
 
				+				continue charloop
			
 
				+			}
			
 
				+			break
			
 
				+		case LINE_BREAK_MODE_WIN:
			
 
				+			if this.sourceData[this.currentPosition] == '\r' && this.sourceData[this.currentPosition + 1] == '\n'{
			
 
				+				oln := this.lineno
			
 
				+				olp := this.linepos
			
 
				+				this.currentPosition += 2
			
 
				+				this.lineno++
			
 
				+				this.linepos = 1
			
 
				+				flag = true
			
 
				+				if this.conf.UseLineBreakAsToken {
			
 
				+					return this.conf.LineBreakTokenID, []byte("\r\n"), oln, olp
			
 
				+				}
			
 
				+				continue charloop
			
 
				+			}
			
 
				+			break
			
 
				+		}
			
 
				+		regloop:
			
 
				+		for tokentype, reginst := range this.tokenRegList {
			
 
				+			m := reginst.reg.FindIndex(this.sourceData[this.currentPosition:])
			
 
				+			if len(m) == 2 {
			
 
				+				if m[1] > 0 {
			
 
				+					st := this.currentPosition
			
 
				+					ed := this.currentPosition + m[1]
			
 
				+					oln := this.lineno
			
 
				+					olp := this.linepos
			
 
				+					this.currentPosition = ed
			
 
				+					if reginst.multiline {
			
 
				+						lno,lpo := this.calcMultilineTokenPosOffset(this.sourceData[st:ed])
			
 
				+						if lno > 0 {
			
 
				+							this.lineno += lno
			
 
				+							this.linepos = lpo + 1
			
 
				+						}else {
			
 
				+							this.linepos += lpo
			
 
				+						}
			
 
				+					}else {
			
 
				+						this.linepos += m[1]
			
 
				+					}
			
 
				+					if !reginst.decor {
			
 
				+						return tokentype, this.sourceData[st:ed], oln, olp
			
 
				+					}
			
 
				+					flag = true
			
 
				+					break regloop
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+		if !flag{
			
 
				+			this.conf.LexicalErrorCallback(this.sourceData[this.currentPosition], this.lineno, this.linepos)
			
 
				+			return 0, []byte{},this.lineno, this.linepos
			
 
				+		}
			
 
				+	}
			
 
				+	return 0, []byte{},this.lineno, this.linepos
			
 
				+}
			
 
				+
			
 
				+func (this *ZTokenizer)checkLineBreakMode() Enum_LineBreakMode {
			
 
				+	if this.reg_lbr_rn.Match(this.sourceData){
			
 
				+		return LINE_BREAK_MODE_WIN
			
 
				+	}
			
 
				+	if this.reg_lbr_r.Match(this.sourceData){
			
 
				+		return LINE_BREAK_MODE_MAC
			
 
				+	}
			
 
				+	if this.reg_lbr_n.Match(this.sourceData){
			
 
				+		return LINE_BREAK_MODE_UNIX
			
 
				+	}
			
 
				+	return LINE_BREAK_MODE_NONE
			
 
				+}
			
 
				+
			
 
				+func (this *ZTokenizer)calcMultilineTokenPosOffset(data []byte) (lnoffset,lpoffset int) {
			
 
				+	cpi := 0
			
 
				+	dlen := len(data)
			
 
				+	lcnt := 0
			
 
				+	po := 0
			
 
				+	var reg *regexp.Regexp
			
 
				+	switch this.lbmode {
			
 
				+	case LINE_BREAK_MODE_NONE:
			
 
				+		return 0, dlen
			
 
				+	case LINE_BREAK_MODE_WIN:
			
 
				+		reg = this.reg_lbr_rn
			
 
				+		break
			
 
				+	case LINE_BREAK_MODE_MAC:
			
 
				+		reg = this.reg_lbr_r
			
 
				+		break
			
 
				+	case LINE_BREAK_MODE_UNIX:
			
 
				+		reg = this.reg_lbr_n
			
 
				+		break
			
 
				+	}
			
 
				+	for cpi < dlen {
			
 
				+		m := reg.FindIndex(data[cpi:])
			
 
				+		if len(m) == 2 {
			
 
				+			if m[1] > 0 {
			
 
				+				lcnt++
			
 
				+				cpi += m[1]
			
 
				+				po = cpi
			
 
				+			}else {
			
 
				+				break
			
 
				+			}
			
 
				+		}else {
			
 
				+			break
			
 
				+		}
			
 
				+	}
			
 
				+	return lcnt, dlen - po
			
 
				+}