Browse Source

Coded and Simply Tested (As ANSI-C Lexical Parser).

zry 6 years ago
parent
commit
1380b030d7
1 changed files with 235 additions and 0 deletions
  1. 235 0
      tokenizer.go

+ 235 - 0
tokenizer.go

@@ -0,0 +1,235 @@
+package zTokenizer
+
+import (
+	"regexp"
+)
+
+type Enum_LineBreakMode uint8 // Enum_LineBreakMode selects which byte sequence the tokenizer treats as a line break.
+
+const (
+	LINE_BREAK_MODE_AUTO Enum_LineBreakMode = iota // detect from the input: "\r\n" if present, else "\r", else "\n", else none
+	LINE_BREAK_MODE_WIN                            // "\r\n"
+	LINE_BREAK_MODE_MAC                            // "\r"
+	LINE_BREAK_MODE_UNIX                           // "\n"
+	LINE_BREAK_MODE_NONE                           // no line-break handling; the line number never advances
+)
+
+// ZTokenizer is a regexp-driven lexer over an in-memory byte slice.
+// Create one with NewZTokenizer, load text with Input, then call GetLex
+// repeatedly to pull tokens.
+type ZTokenizer struct {
+	// tokenRegList holds the compiled token patterns keyed by token ID.
+	tokenRegList map[int]tokenRegInst
+	// sourceData is the text handed to Input.
+	sourceData []byte
+	// conf is the configuration the tokenizer was built from.
+	conf ZTokenizerConf
+	// currentPosition is the byte offset of the next unread byte.
+	currentPosition int
+	// totalLength is len(sourceData), cached by Input.
+	totalLength int
+	// lineno and linepos are the 1-based line and column of
+	// currentPosition; Input resets both to 1.
+	lineno  int
+	linepos int
+	// lbmode is the effective line-break mode chosen by Input.
+	lbmode Enum_LineBreakMode
+	// reg_lbr_r, reg_lbr_n and reg_lbr_rn match "\r", "\n" and "\r\n".
+	reg_lbr_r  *regexp.Regexp
+	reg_lbr_n  *regexp.Regexp
+	reg_lbr_rn *regexp.Regexp
+	// NOTE(review): lbrlist is not referenced by any visible code in this
+	// file; confirm whether it is still needed.
+	lbrlist []int
+}
+
+// ZTokenizerConf describes the token patterns and line-break handling of a
+// ZTokenizer.
+//
+// Each of the four pattern maps goes from token ID to regexp source.
+// NewZTokenizer registers them in the order SingleLineDecor, MultiLineDecor,
+// SingleLine, MultiLine, so an ID that appears in more than one map keeps
+// the definition registered last.
+type ZTokenizerConf struct {
+	// SingleLineTokenRegexp lists tokens that GetLex returns; the column
+	// advances by the match length.
+	SingleLineTokenRegexp map[int]string
+	// MultiLineTokenRegexp lists tokens that GetLex returns; line and column
+	// are advanced by counting line breaks inside the match.
+	MultiLineTokenRegexp map[int]string
+	// SingleLineDecorTokenRegexp lists patterns that are consumed but never
+	// returned from GetLex.
+	SingleLineDecorTokenRegexp map[int]string
+	// MultiLineDecorTokenRegexp is the decor counterpart of
+	// MultiLineTokenRegexp: consumed, position-tracked, not returned.
+	MultiLineDecorTokenRegexp map[int]string
+	// LineBreakMode selects the line-break sequence; LINE_BREAK_MODE_AUTO
+	// makes Input detect it from the source.
+	LineBreakMode Enum_LineBreakMode
+	// UseLineBreakAsToken makes GetLex return each line break as a token
+	// with ID LineBreakTokenID instead of skipping it.
+	UseLineBreakAsToken bool
+	LineBreakTokenID    int
+	// LexicalErrorCallback is invoked by GetLex with the offending byte and
+	// its line and column when no pattern matches at the current position.
+	LexicalErrorCallback func(char byte, lineno int, linepos int)
+}
+
+// tokenRegInst is one compiled token pattern together with the flags that
+// tell GetLex how to treat a match.
+type tokenRegInst struct {
+	// reg is the compiled pattern.
+	reg *regexp.Regexp
+	// multiline: the match may contain line breaks, so line and column are
+	// recomputed from the matched text.
+	multiline bool
+	// decor: the match is consumed but not returned as a token.
+	decor bool
+}
+
+// NewZTokenizer builds a tokenizer from conf.
+//
+// Every pattern is compiled as "^(?:pattern)" so the whole expression,
+// including every branch of a top-level alternation such as "int|float",
+// is anchored to the current scan position. A bare "^" prefix would anchor
+// only the first branch.
+//
+// Patterns are registered in the order SingleLineDecor, MultiLineDecor,
+// SingleLine, MultiLine; an ID present in several maps keeps the definition
+// registered last.
+//
+// NewZTokenizer panics (via regexp.MustCompile) if a pattern is not a valid
+// regular expression.
+func NewZTokenizer(conf ZTokenizerConf) *ZTokenizer {
+	zt := ZTokenizer{
+		tokenRegList: make(map[int]tokenRegInst),
+		conf:         conf,
+	}
+	// register compiles one pattern map into tokenRegList with the given flags.
+	register := func(patterns map[int]string, multiline, decor bool) {
+		for id, pattern := range patterns {
+			zt.tokenRegList[id] = tokenRegInst{
+				reg:       regexp.MustCompile("^(?:" + pattern + ")"),
+				multiline: multiline,
+				decor:     decor,
+			}
+		}
+	}
+	register(conf.SingleLineDecorTokenRegexp, false, true)
+	register(conf.MultiLineDecorTokenRegexp, true, true)
+	register(conf.SingleLineTokenRegexp, false, false)
+	register(conf.MultiLineTokenRegexp, true, false)
+	zt.reg_lbr_rn = regexp.MustCompile("\r\n")
+	zt.reg_lbr_r = regexp.MustCompile("\r")
+	zt.reg_lbr_n = regexp.MustCompile("\n")
+	return &zt
+}
+
+// Input loads source as the text to tokenize and rewinds the scanner to
+// offset 0, line 1, column 1. When the configured line-break mode is
+// LINE_BREAK_MODE_AUTO the mode is detected from source; otherwise the
+// configured mode is used as is. The slice is kept, not copied.
+func (zt *ZTokenizer) Input(source []byte) {
+	zt.sourceData, zt.totalLength = source, len(source)
+	zt.currentPosition = 0
+	zt.lineno, zt.linepos = 1, 1
+	mode := zt.conf.LineBreakMode
+	if mode == LINE_BREAK_MODE_AUTO {
+		// Detection reads zt.sourceData, which is already set above.
+		mode = zt.checkLineBreakMode()
+	}
+	zt.lbmode = mode
+}
+
+func (this *ZTokenizer)GetLex() (token int, value []byte, lineno int, linepos int) {
+	charloop:
+	for this.currentPosition < this.totalLength{
+		flag := false
+		switch this.lbmode {
+		case LINE_BREAK_MODE_NONE:
+			break
+		case LINE_BREAK_MODE_UNIX:
+			if this.sourceData[this.currentPosition] == '\n'{
+				oln := this.lineno
+				olp := this.linepos
+				this.currentPosition++
+				this.lineno++
+				this.linepos = 1
+				flag = true
+				if this.conf.UseLineBreakAsToken {
+					return this.conf.LineBreakTokenID, []byte("\n"), oln, olp
+				}
+				continue charloop
+			}
+			break
+		case LINE_BREAK_MODE_MAC:
+			if this.sourceData[this.currentPosition] == '\r'{
+				oln := this.lineno
+				olp := this.linepos
+				this.currentPosition++
+				this.lineno++
+				this.linepos = 1
+				flag = true
+				if this.conf.UseLineBreakAsToken {
+					return this.conf.LineBreakTokenID, []byte("\r"), oln, olp
+				}
+				continue charloop
+			}
+			break
+		case LINE_BREAK_MODE_WIN:
+			if this.sourceData[this.currentPosition] == '\r' && this.sourceData[this.currentPosition + 1] == '\n'{
+				oln := this.lineno
+				olp := this.linepos
+				this.currentPosition += 2
+				this.lineno++
+				this.linepos = 1
+				flag = true
+				if this.conf.UseLineBreakAsToken {
+					return this.conf.LineBreakTokenID, []byte("\r\n"), oln, olp
+				}
+				continue charloop
+			}
+			break
+		}
+		regloop:
+		for tokentype, reginst := range this.tokenRegList {
+			m := reginst.reg.FindIndex(this.sourceData[this.currentPosition:])
+			if len(m) == 2 {
+				if m[1] > 0 {
+					st := this.currentPosition
+					ed := this.currentPosition + m[1]
+					oln := this.lineno
+					olp := this.linepos
+					this.currentPosition = ed
+					if reginst.multiline {
+						lno,lpo := this.calcMultilineTokenPosOffset(this.sourceData[st:ed])
+						if lno > 0 {
+							this.lineno += lno
+							this.linepos = lpo + 1
+						}else {
+							this.linepos += lpo
+						}
+					}else {
+						this.linepos += m[1]
+					}
+					if !reginst.decor {
+						return tokentype, this.sourceData[st:ed], oln, olp
+					}
+					flag = true
+					break regloop
+				}
+			}
+		}
+		if !flag{
+			this.conf.LexicalErrorCallback(this.sourceData[this.currentPosition], this.lineno, this.linepos)
+			return 0, []byte{},this.lineno, this.linepos
+		}
+	}
+	return 0, []byte{},this.lineno, this.linepos
+}
+
+// checkLineBreakMode guesses the line-break convention of the loaded
+// source. The checks run in priority order: any "\r\n" means Windows, else
+// any "\r" means classic Mac, else any "\n" means Unix, else none.
+func (zt *ZTokenizer) checkLineBreakMode() Enum_LineBreakMode {
+	src := zt.sourceData
+	switch {
+	case zt.reg_lbr_rn.Match(src):
+		return LINE_BREAK_MODE_WIN
+	case zt.reg_lbr_r.Match(src):
+		return LINE_BREAK_MODE_MAC
+	case zt.reg_lbr_n.Match(src):
+		return LINE_BREAK_MODE_UNIX
+	default:
+		return LINE_BREAK_MODE_NONE
+	}
+}
+
+// calcMultilineTokenPosOffset reports how a token's text moves the cursor.
+// lnoffset is the number of line breaks (of the active mode) inside data;
+// lpoffset is the number of bytes after the last line break, or len(data)
+// when data contains none.
+//
+// If no line-break matcher applies (LINE_BREAK_MODE_NONE, an unresolved
+// LINE_BREAK_MODE_AUTO, an unknown mode value, or a tokenizer whose
+// matchers were never initialised), the text is treated as a single line
+// rather than dereferencing a nil regexp.
+func (zt *ZTokenizer) calcMultilineTokenPosOffset(data []byte) (lnoffset, lpoffset int) {
+	var lineBreak *regexp.Regexp
+	switch zt.lbmode {
+	case LINE_BREAK_MODE_WIN:
+		lineBreak = zt.reg_lbr_rn
+	case LINE_BREAK_MODE_MAC:
+		lineBreak = zt.reg_lbr_r
+	case LINE_BREAK_MODE_UNIX:
+		lineBreak = zt.reg_lbr_n
+	}
+	if lineBreak == nil {
+		return 0, len(data)
+	}
+	breaks := lineBreak.FindAllIndex(data, -1)
+	if len(breaks) == 0 {
+		return 0, len(data)
+	}
+	// Everything after the end of the last line break sits on the final line.
+	lastEnd := breaks[len(breaks)-1][1]
+	return len(breaks), len(data) - lastEnd
+}