// zry/zTokenizer

							package zTokenizer

import (
	"regexp"
	"sort"
)

// Enum_LineBreakMode selects which byte sequence the tokenizer treats as a
// line break while scanning.
type Enum_LineBreakMode uint8

// Line-break modes. The numeric values start at zero and increase by one in
// declaration order.
const (
	// LINE_BREAK_MODE_AUTO makes Input detect the mode from the source data.
	LINE_BREAK_MODE_AUTO = Enum_LineBreakMode(iota)
	// LINE_BREAK_MODE_WIN treats "\r\n" as the line break.
	LINE_BREAK_MODE_WIN
	// LINE_BREAK_MODE_MAC treats "\r" as the line break.
	LINE_BREAK_MODE_MAC
	// LINE_BREAK_MODE_UNIX treats "\n" as the line break.
	LINE_BREAK_MODE_UNIX
	// LINE_BREAK_MODE_NONE disables line-break handling entirely.
	LINE_BREAK_MODE_NONE
)

// ZTokenizer is a regexp-driven lexer. Create one with NewZTokenizer, hand it
// a source buffer with Input, then call GetLex repeatedly to pull tokens.
type ZTokenizer struct {
	// tokenRegList maps a token ID to its compiled regexp and flags.
	tokenRegList map[int]tokenRegInst

	// sourceData is the buffer most recently passed to Input.
	sourceData []byte

	// conf is the configuration given to NewZTokenizer.
	conf ZTokenizerConf

	// currentPosition is the byte offset of the next unread byte in sourceData.
	currentPosition int

	// totalLength is len(sourceData), captured by Input.
	totalLength int

	// lineno is the 1-based line number of currentPosition.
	lineno int

	// linepos is the 1-based position of currentPosition within its line,
	// counted in bytes.
	linepos int

	// lbmode is the line-break mode in effect for the current input.
	lbmode Enum_LineBreakMode

	// reg_lbr_r, reg_lbr_n and reg_lbr_rn match "\r", "\n" and "\r\n".
	reg_lbr_r  *regexp.Regexp
	reg_lbr_n  *regexp.Regexp
	reg_lbr_rn *regexp.Regexp

	// lbrlist is not referenced by the methods visible in this part of the file.
	lbrlist []int
}

// ZTokenizerConf configures a ZTokenizer. Each of the four regexp maps is
// keyed by token ID and holds the regexp source for that token; NewZTokenizer
// compiles every entry anchored to the start of the remaining input.
type ZTokenizerConf struct {
	// SingleLineTokenRegexp holds tokens that are returned by GetLex and whose
	// line position is advanced by the match length only.
	SingleLineTokenRegexp map[int]string

	// MultiLineTokenRegexp holds tokens that are returned by GetLex and whose
	// matched text is scanned for line breaks to update line/position.
	MultiLineTokenRegexp map[int]string

	// SingleLineDecorTokenRegexp holds tokens that are consumed but not
	// returned by GetLex; position is advanced by the match length only.
	SingleLineDecorTokenRegexp map[int]string

	// MultiLineDecorTokenRegexp holds tokens that are consumed but not
	// returned by GetLex; the matched text is scanned for line breaks.
	MultiLineDecorTokenRegexp map[int]string

	// LineBreakMode selects the line-break sequence; LINE_BREAK_MODE_AUTO
	// makes Input detect it from the source.
	LineBreakMode Enum_LineBreakMode

	// UseLineBreakAsToken makes GetLex return each line break as a token
	// instead of skipping it.
	UseLineBreakAsToken bool

	// LineBreakTokenID is the token ID reported for line-break tokens.
	LineBreakTokenID int

	// LexicalErrorCallback is invoked by GetLex with the offending byte and
	// its line number and line position when no token regexp matches.
	LexicalErrorCallback func(char byte, lineno int, linepos int)
}

// tokenRegInst is one registered token pattern together with the flags that
// control how GetLex treats a match.
type tokenRegInst struct {
	// reg is the compiled pattern.
	reg *regexp.Regexp

	// multiline reports whether matched text must be scanned for line breaks
	// when updating the line number and line position.
	multiline bool

	// decor reports whether a match is consumed silently rather than being
	// returned to the caller.
	decor bool
}

// NewZTokenizer builds a tokenizer from conf.
//
// Every pattern in the four regexp maps of conf is compiled and stored under
// its token ID. The maps are registered in the order single-line decor,
// multi-line decor, single-line, multi-line; if the same ID appears in more
// than one map, the later registration replaces the earlier one.
//
// Each pattern is compiled as "^(?:pattern)". The non-capturing group keeps
// the anchor bound to the whole pattern: with a bare "^" prefix, a pattern
// such as "foo|bar" would parse as "(^foo)|bar" and could match "bar"
// anywhere in the remaining input.
//
// NewZTokenizer panics (via regexp.MustCompile) if any pattern is not a valid
// regular expression.
func NewZTokenizer(conf ZTokenizerConf) *ZTokenizer {
	zt := ZTokenizer{
		tokenRegList: make(map[int]tokenRegInst),
		conf:         conf,
	}

	// register compiles every pattern of one configuration map and stores it
	// with the given flags.
	register := func(patterns map[int]string, multiline, decor bool) {
		for id, pattern := range patterns {
			zt.tokenRegList[id] = tokenRegInst{
				reg:       regexp.MustCompile("^(?:" + pattern + ")"),
				multiline: multiline,
				decor:     decor,
			}
		}
	}
	register(conf.SingleLineDecorTokenRegexp, false, true)
	register(conf.MultiLineDecorTokenRegexp, true, true)
	register(conf.SingleLineTokenRegexp, false, false)
	register(conf.MultiLineTokenRegexp, true, false)

	zt.reg_lbr_rn = regexp.MustCompile("\r\n")
	zt.reg_lbr_r = regexp.MustCompile("\r")
	zt.reg_lbr_n = regexp.MustCompile("\n")
	return &zt
}

// Input sets source as the buffer to tokenize and rewinds the scanner to its
// first byte, at line 1, position 1. When the configured line-break mode is
// LINE_BREAK_MODE_AUTO the mode is detected from source; otherwise the
// configured mode is used as is.
func (this *ZTokenizer) Input(source []byte) {
	this.sourceData, this.totalLength = source, len(source)
	this.currentPosition = 0
	this.lineno, this.linepos = 1, 1

	// Detection reads this.sourceData, so it must run after the assignment above.
	mode := this.conf.LineBreakMode
	if mode == LINE_BREAK_MODE_AUTO {
		mode = this.checkLineBreakMode()
	}
	this.lbmode = mode
}

// GetLex scans forward from the current position and returns the next token.
//
// Returns the token ID, the matched bytes, and the line number and line
// position (both 1-based, position counted in bytes) at which the token
// starts. Non-line-break token values are sub-slices of the source buffer.
//
// Scanning rules, applied at each position:
//   - A line break (as defined by the active line-break mode) advances the
//     line counter. It is returned as a token with conf.LineBreakTokenID when
//     conf.UseLineBreakAsToken is set, and skipped otherwise.
//   - Otherwise the registered token regexps are tried in ascending token-ID
//     order and the first non-empty match starting at the current position
//     wins. Decor tokens are consumed and scanning continues; all other
//     tokens are returned.
//   - If nothing matches, conf.LexicalErrorCallback (when non-nil) is called
//     with the offending byte and GetLex returns token 0 with an empty value.
//     The position is not advanced, so a subsequent call sees the same byte.
//
// At end of input GetLex returns token 0 with an empty value and the current
// line number and position.
func (this *ZTokenizer) GetLex() (token int, value []byte, lineno int, linepos int) {
	// Go randomizes map iteration order, so ranging over tokenRegList directly
	// would make the winner among overlapping patterns vary from run to run.
	// Trying the IDs in sorted order keeps tokenization deterministic.
	ids := make([]int, 0, len(this.tokenRegList))
	for id := range this.tokenRegList {
		ids = append(ids, id)
	}
	sort.Ints(ids)

	for this.currentPosition < this.totalLength {
		rest := this.sourceData[this.currentPosition:]

		// Determine whether a line break starts at the current position.
		var brk string
		switch this.lbmode {
		case LINE_BREAK_MODE_UNIX:
			if rest[0] == '\n' {
				brk = "\n"
			}
		case LINE_BREAK_MODE_MAC:
			if rest[0] == '\r' {
				brk = "\r"
			}
		case LINE_BREAK_MODE_WIN:
			// The length check prevents reading past the buffer when the
			// input ends with a lone '\r'.
			if len(rest) >= 2 && rest[0] == '\r' && rest[1] == '\n' {
				brk = "\r\n"
			}
		}
		if brk != "" {
			oln, olp := this.lineno, this.linepos
			this.currentPosition += len(brk)
			this.lineno++
			this.linepos = 1
			if this.conf.UseLineBreakAsToken {
				return this.conf.LineBreakTokenID, []byte(brk), oln, olp
			}
			continue
		}

		consumed := false
		for _, id := range ids {
			reginst := this.tokenRegList[id]
			m := reginst.reg.FindIndex(rest)
			// Accept only non-empty matches that begin exactly at the current
			// position; m[1] is used as the token length below, which is only
			// valid when the match starts at offset 0.
			if len(m) != 2 || m[0] != 0 || m[1] == 0 {
				continue
			}
			st := this.currentPosition
			ed := st + m[1]
			oln, olp := this.lineno, this.linepos
			this.currentPosition = ed
			if reginst.multiline {
				lno, lpo := this.calcMultilineTokenPosOffset(this.sourceData[st:ed])
				if lno > 0 {
					// The token ended on a later line: lpo bytes follow the
					// last line break inside it.
					this.lineno += lno
					this.linepos = lpo + 1
				} else {
					this.linepos += lpo
				}
			} else {
				this.linepos += m[1]
			}
			if !reginst.decor {
				return id, this.sourceData[st:ed], oln, olp
			}
			consumed = true
			break
		}
		if !consumed {
			if cb := this.conf.LexicalErrorCallback; cb != nil {
				cb(this.sourceData[this.currentPosition], this.lineno, this.linepos)
			}
			return 0, []byte{}, this.lineno, this.linepos
		}
	}
	return 0, []byte{}, this.lineno, this.linepos
}

// checkLineBreakMode inspects the current source buffer and reports which
// line-break mode it appears to use. "\r\n" is checked first, then "\r", then
// "\n"; the first sequence found anywhere in the buffer decides the mode. If
// none is present the result is LINE_BREAK_MODE_NONE.
func (this *ZTokenizer) checkLineBreakMode() Enum_LineBreakMode {
	src := this.sourceData
	switch {
	case this.reg_lbr_rn.Match(src):
		return LINE_BREAK_MODE_WIN
	case this.reg_lbr_r.Match(src):
		return LINE_BREAK_MODE_MAC
	case this.reg_lbr_n.Match(src):
		return LINE_BREAK_MODE_UNIX
	default:
		return LINE_BREAK_MODE_NONE
	}
}

// calcMultilineTokenPosOffset measures how a matched token moves the line
// and position counters.
//
// lnoffset is the number of line breaks (per the active line-break mode)
// contained in data. lpoffset is the number of bytes in data after the last
// line break, or len(data) when data contains none.
//
// When no line-break regexp applies (LINE_BREAK_MODE_NONE, an unresolved
// LINE_BREAK_MODE_AUTO, an unknown mode value, or a tokenizer whose
// line-break regexps were never initialised) the token is treated as
// containing no line breaks instead of dereferencing a nil regexp.
func (this *ZTokenizer) calcMultilineTokenPosOffset(data []byte) (lnoffset, lpoffset int) {
	var reg *regexp.Regexp
	switch this.lbmode {
	case LINE_BREAK_MODE_WIN:
		reg = this.reg_lbr_rn
	case LINE_BREAK_MODE_MAC:
		reg = this.reg_lbr_r
	case LINE_BREAK_MODE_UNIX:
		reg = this.reg_lbr_n
	}
	if reg == nil {
		return 0, len(data)
	}

	// FindAllIndex yields the successive non-overlapping line breaks; only
	// their count and the end offset of the last one are needed.
	breaks := reg.FindAllIndex(data, -1)
	if len(breaks) == 0 {
		return 0, len(data)
	}
	last := breaks[len(breaks)-1]
	return len(breaks), len(data) - last[1]
}