123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245 |
- package zTokenizer
- import (
- "regexp"
- "sort"
- )
// Enum_LineBreakMode selects which byte sequence the tokenizer treats as a
// line break while scanning.
type Enum_LineBreakMode uint8

// Line-break modes. The zero value is LINE_BREAK_MODE_AUTO.
const (
	// LINE_BREAK_MODE_AUTO asks Input to pick one of the modes below by
	// inspecting the source buffer.
	LINE_BREAK_MODE_AUTO Enum_LineBreakMode = iota
	// LINE_BREAK_MODE_WIN treats "\r\n" as the line break.
	LINE_BREAK_MODE_WIN
	// LINE_BREAK_MODE_MAC treats a lone "\r" as the line break.
	LINE_BREAK_MODE_MAC
	// LINE_BREAK_MODE_UNIX treats "\n" as the line break.
	LINE_BREAK_MODE_UNIX
	// LINE_BREAK_MODE_NONE disables line-break handling; no byte sequence
	// advances the line counter.
	LINE_BREAK_MODE_NONE
)
- type ZTokenizer struct {
- tokenlist []int
- tokenRegList map[int]tokenRegInst
- sourceData []byte
- conf ZTokenizerConf
- currentPosition int
- totalLength int
- lineno int
- linepos int
- lbmode Enum_LineBreakMode
- reg_lbr_r *regexp.Regexp
- reg_lbr_n *regexp.Regexp
- reg_lbr_rn *regexp.Regexp
- lbrlist []int
- }
- type ZTokenizerConf struct {
- SingleLineTokenRegexp map[int]string
- MultiLineTokenRegexp map[int]string
- SingleLineDecorTokenRegexp map[int]string
- MultiLineDecorTokenRegexp map[int]string
- LineBreakMode Enum_LineBreakMode
- UseLineBreakAsToken bool
- LineBreakTokenID int
- LexicalErrorCallback func(char byte, lineno int, linepos int)
- }
// tokenRegInst is one registered token pattern together with the flags that
// control how GetLex treats a match.
type tokenRegInst struct {
	// reg is the compiled pattern, anchored at the scan cursor.
	reg *regexp.Regexp

	// multiline marks patterns whose match may contain line breaks.
	// decor marks patterns that are consumed without being returned.
	multiline, decor bool
}
- func NewZTokenizer(conf ZTokenizerConf) *ZTokenizer {
- zt := ZTokenizer{
- tokenRegList: make(map[int]tokenRegInst),
- conf: conf,
- }
- for k,v := range conf.SingleLineDecorTokenRegexp{
- zt.tokenRegList[k] = tokenRegInst{
- reg: regexp.MustCompile("^" + v),
- multiline: false,
- decor: true,
- }
- }
- for k,v := range conf.MultiLineDecorTokenRegexp{
- zt.tokenRegList[k] = tokenRegInst{
- reg: regexp.MustCompile("^" + v),
- multiline: true,
- decor: true,
- }
- }
- for k,v := range conf.SingleLineTokenRegexp{
- zt.tokenRegList[k] = tokenRegInst{
- reg: regexp.MustCompile("^" + v),
- multiline: false,
- decor: false,
- }
- }
- for k,v := range conf.MultiLineTokenRegexp{
- zt.tokenRegList[k] = tokenRegInst{
- reg: regexp.MustCompile("^" + v),
- multiline: true,
- decor: false,
- }
- }
- zt.tokenlist = make([]int, len(zt.tokenRegList))
- i := 0
- for k := range zt.tokenRegList {
- zt.tokenlist[i] = k
- i++
- }
- sort.Ints(zt.tokenlist)
- zt.reg_lbr_rn = regexp.MustCompile("\r\n")
- zt.reg_lbr_r = regexp.MustCompile("\r")
- zt.reg_lbr_n = regexp.MustCompile("\n")
- return &zt
- }
- func (this *ZTokenizer)Input(source []byte) {
- this.sourceData = source
- this.currentPosition = 0
- this.totalLength = len(source)
- this.lineno = 1
- this.linepos = 1
- if this.conf.LineBreakMode == LINE_BREAK_MODE_AUTO {
- this.lbmode = this.checkLineBreakMode()
- }else {
- this.lbmode = this.conf.LineBreakMode
- }
- }
- func (this *ZTokenizer)GetLex() (token int, value []byte, lineno int, linepos int) {
- charloop:
- for this.currentPosition < this.totalLength{
- flag := false
- switch this.lbmode {
- case LINE_BREAK_MODE_NONE:
- break
- case LINE_BREAK_MODE_UNIX:
- if this.sourceData[this.currentPosition] == '\n'{
- oln := this.lineno
- olp := this.linepos
- this.currentPosition++
- this.lineno++
- this.linepos = 1
- flag = true
- if this.conf.UseLineBreakAsToken {
- return this.conf.LineBreakTokenID, []byte("\n"), oln, olp
- }
- continue charloop
- }
- break
- case LINE_BREAK_MODE_MAC:
- if this.sourceData[this.currentPosition] == '\r'{
- oln := this.lineno
- olp := this.linepos
- this.currentPosition++
- this.lineno++
- this.linepos = 1
- flag = true
- if this.conf.UseLineBreakAsToken {
- return this.conf.LineBreakTokenID, []byte("\r"), oln, olp
- }
- continue charloop
- }
- break
- case LINE_BREAK_MODE_WIN:
- if this.sourceData[this.currentPosition] == '\r' && this.sourceData[this.currentPosition + 1] == '\n'{
- oln := this.lineno
- olp := this.linepos
- this.currentPosition += 2
- this.lineno++
- this.linepos = 1
- flag = true
- if this.conf.UseLineBreakAsToken {
- return this.conf.LineBreakTokenID, []byte("\r\n"), oln, olp
- }
- continue charloop
- }
- break
- }
- regloop:
- for _,tokentype := range this.tokenlist {
- reginst := this.tokenRegList[tokentype]
- m := reginst.reg.FindIndex(this.sourceData[this.currentPosition:])
- if len(m) == 2 {
- if m[1] > 0 {
- st := this.currentPosition
- ed := this.currentPosition + m[1]
- oln := this.lineno
- olp := this.linepos
- this.currentPosition = ed
- if reginst.multiline {
- lno,lpo := this.calcMultilineTokenPosOffset(this.sourceData[st:ed])
- if lno > 0 {
- this.lineno += lno
- this.linepos = lpo + 1
- }else {
- this.linepos += lpo
- }
- }else {
- this.linepos += m[1]
- }
- if !reginst.decor {
- return tokentype, this.sourceData[st:ed], oln, olp
- }
- flag = true
- break regloop
- }
- }
- }
- if !flag{
- this.conf.LexicalErrorCallback(this.sourceData[this.currentPosition], this.lineno, this.linepos)
- return 0, []byte{},this.lineno, this.linepos
- }
- }
- return 0, []byte{},this.lineno, this.linepos
- }
- func (this *ZTokenizer)checkLineBreakMode() Enum_LineBreakMode {
- if this.reg_lbr_rn.Match(this.sourceData){
- return LINE_BREAK_MODE_WIN
- }
- if this.reg_lbr_r.Match(this.sourceData){
- return LINE_BREAK_MODE_MAC
- }
- if this.reg_lbr_n.Match(this.sourceData){
- return LINE_BREAK_MODE_UNIX
- }
- return LINE_BREAK_MODE_NONE
- }
- func (this *ZTokenizer)calcMultilineTokenPosOffset(data []byte) (lnoffset,lpoffset int) {
- cpi := 0
- dlen := len(data)
- lcnt := 0
- po := 0
- var reg *regexp.Regexp
- switch this.lbmode {
- case LINE_BREAK_MODE_NONE:
- return 0, dlen
- case LINE_BREAK_MODE_WIN:
- reg = this.reg_lbr_rn
- break
- case LINE_BREAK_MODE_MAC:
- reg = this.reg_lbr_r
- break
- case LINE_BREAK_MODE_UNIX:
- reg = this.reg_lbr_n
- break
- }
- for cpi < dlen {
- m := reg.FindIndex(data[cpi:])
- if len(m) == 2 {
- if m[1] > 0 {
- lcnt++
- cpi += m[1]
- po = cpi
- }else {
- break
- }
- }else {
- break
- }
- }
- return lcnt, dlen - po
- }
|