tokenizer.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. package zTokenizer
  2. import (
  3. "regexp"
  4. "sort"
  5. )
  // Enum_LineBreakMode selects which byte sequence the tokenizer treats as a
// line break while tracking line numbers.
type Enum_LineBreakMode uint8

// Line-break modes accepted in ZTokenizerConf.LineBreakMode.
const (
	// LINE_BREAK_MODE_AUTO makes Input inspect the source and pick one of the
	// concrete modes below.
	LINE_BREAK_MODE_AUTO Enum_LineBreakMode = iota

	// LINE_BREAK_MODE_WIN treats the two-byte sequence "\r\n" as a line break.
	LINE_BREAK_MODE_WIN

	// LINE_BREAK_MODE_MAC treats a single "\r" as a line break.
	LINE_BREAK_MODE_MAC

	// LINE_BREAK_MODE_UNIX treats a single "\n" as a line break.
	LINE_BREAK_MODE_UNIX

	// LINE_BREAK_MODE_NONE disables line-break handling: nothing is counted
	// as a line break.
	LINE_BREAK_MODE_NONE
)
  14. type ZTokenizer struct {
  15. tokenlist []int
  16. tokenRegList map[int]tokenRegInst
  17. sourceData []byte
  18. conf ZTokenizerConf
  19. currentPosition int
  20. totalLength int
  21. lineno int
  22. linepos int
  23. lbmode Enum_LineBreakMode
  24. reg_lbr_r *regexp.Regexp
  25. reg_lbr_n *regexp.Regexp
  26. reg_lbr_rn *regexp.Regexp
  27. lbrlist []int
  28. }
  29. type ZTokenizerConf struct {
  30. SingleLineTokenRegexp map[int]string
  31. MultiLineTokenRegexp map[int]string
  32. SingleLineDecorTokenRegexp map[int]string
  33. MultiLineDecorTokenRegexp map[int]string
  34. LineBreakMode Enum_LineBreakMode
  35. UseLineBreakAsToken bool
  36. LineBreakTokenID int
  37. LexicalErrorCallback func(char byte, lineno int, linepos int)
  38. }
  // tokenRegInst is one registered token pattern together with the flags that
// control how a match is processed.
type tokenRegInst struct {
	// reg is the compiled pattern for the token.
	reg *regexp.Regexp
	// multiline marks tokens whose match may span lines, so the matched text
	// must be scanned for line breaks when updating the position.
	multiline bool
	// decor marks tokens that are consumed without being returned to the
	// caller.
	decor bool
}
  44. func NewZTokenizer(conf ZTokenizerConf) *ZTokenizer {
  45. zt := ZTokenizer{
  46. tokenRegList: make(map[int]tokenRegInst),
  47. conf: conf,
  48. }
  49. for k,v := range conf.SingleLineDecorTokenRegexp{
  50. zt.tokenRegList[k] = tokenRegInst{
  51. reg: regexp.MustCompile("^" + v),
  52. multiline: false,
  53. decor: true,
  54. }
  55. }
  56. for k,v := range conf.MultiLineDecorTokenRegexp{
  57. zt.tokenRegList[k] = tokenRegInst{
  58. reg: regexp.MustCompile("^" + v),
  59. multiline: true,
  60. decor: true,
  61. }
  62. }
  63. for k,v := range conf.SingleLineTokenRegexp{
  64. zt.tokenRegList[k] = tokenRegInst{
  65. reg: regexp.MustCompile("^" + v),
  66. multiline: false,
  67. decor: false,
  68. }
  69. }
  70. for k,v := range conf.MultiLineTokenRegexp{
  71. zt.tokenRegList[k] = tokenRegInst{
  72. reg: regexp.MustCompile("^" + v),
  73. multiline: true,
  74. decor: false,
  75. }
  76. }
  77. zt.tokenlist = make([]int, len(zt.tokenRegList))
  78. i := 0
  79. for k := range zt.tokenRegList {
  80. zt.tokenlist[i] = k
  81. i++
  82. }
  83. sort.Ints(zt.tokenlist)
  84. zt.reg_lbr_rn = regexp.MustCompile("\r\n")
  85. zt.reg_lbr_r = regexp.MustCompile("\r")
  86. zt.reg_lbr_n = regexp.MustCompile("\n")
  87. return &zt
  88. }
  89. func (this *ZTokenizer)Input(source []byte) {
  90. this.sourceData = source
  91. this.currentPosition = 0
  92. this.totalLength = len(source)
  93. this.lineno = 1
  94. this.linepos = 1
  95. if this.conf.LineBreakMode == LINE_BREAK_MODE_AUTO {
  96. this.lbmode = this.checkLineBreakMode()
  97. }else {
  98. this.lbmode = this.conf.LineBreakMode
  99. }
  100. }
  101. func (this *ZTokenizer)GetLex() (token int, value []byte, lineno int, linepos int) {
  102. charloop:
  103. for this.currentPosition < this.totalLength{
  104. flag := false
  105. switch this.lbmode {
  106. case LINE_BREAK_MODE_NONE:
  107. break
  108. case LINE_BREAK_MODE_UNIX:
  109. if this.sourceData[this.currentPosition] == '\n'{
  110. oln := this.lineno
  111. olp := this.linepos
  112. this.currentPosition++
  113. this.lineno++
  114. this.linepos = 1
  115. flag = true
  116. if this.conf.UseLineBreakAsToken {
  117. return this.conf.LineBreakTokenID, []byte("\n"), oln, olp
  118. }
  119. continue charloop
  120. }
  121. break
  122. case LINE_BREAK_MODE_MAC:
  123. if this.sourceData[this.currentPosition] == '\r'{
  124. oln := this.lineno
  125. olp := this.linepos
  126. this.currentPosition++
  127. this.lineno++
  128. this.linepos = 1
  129. flag = true
  130. if this.conf.UseLineBreakAsToken {
  131. return this.conf.LineBreakTokenID, []byte("\r"), oln, olp
  132. }
  133. continue charloop
  134. }
  135. break
  136. case LINE_BREAK_MODE_WIN:
  137. if this.sourceData[this.currentPosition] == '\r' && this.sourceData[this.currentPosition + 1] == '\n'{
  138. oln := this.lineno
  139. olp := this.linepos
  140. this.currentPosition += 2
  141. this.lineno++
  142. this.linepos = 1
  143. flag = true
  144. if this.conf.UseLineBreakAsToken {
  145. return this.conf.LineBreakTokenID, []byte("\r\n"), oln, olp
  146. }
  147. continue charloop
  148. }
  149. break
  150. }
  151. regloop:
  152. for _,tokentype := range this.tokenlist {
  153. reginst := this.tokenRegList[tokentype]
  154. m := reginst.reg.FindIndex(this.sourceData[this.currentPosition:])
  155. if len(m) == 2 {
  156. if m[1] > 0 {
  157. st := this.currentPosition
  158. ed := this.currentPosition + m[1]
  159. oln := this.lineno
  160. olp := this.linepos
  161. this.currentPosition = ed
  162. if reginst.multiline {
  163. lno,lpo := this.calcMultilineTokenPosOffset(this.sourceData[st:ed])
  164. if lno > 0 {
  165. this.lineno += lno
  166. this.linepos = lpo + 1
  167. }else {
  168. this.linepos += lpo
  169. }
  170. }else {
  171. this.linepos += m[1]
  172. }
  173. if !reginst.decor {
  174. return tokentype, this.sourceData[st:ed], oln, olp
  175. }
  176. flag = true
  177. break regloop
  178. }
  179. }
  180. }
  181. if !flag{
  182. this.conf.LexicalErrorCallback(this.sourceData[this.currentPosition], this.lineno, this.linepos)
  183. return 0, []byte{},this.lineno, this.linepos
  184. }
  185. }
  186. return 0, []byte{},this.lineno, this.linepos
  187. }
  188. func (this *ZTokenizer)checkLineBreakMode() Enum_LineBreakMode {
  189. if this.reg_lbr_rn.Match(this.sourceData){
  190. return LINE_BREAK_MODE_WIN
  191. }
  192. if this.reg_lbr_r.Match(this.sourceData){
  193. return LINE_BREAK_MODE_MAC
  194. }
  195. if this.reg_lbr_n.Match(this.sourceData){
  196. return LINE_BREAK_MODE_UNIX
  197. }
  198. return LINE_BREAK_MODE_NONE
  199. }
  200. func (this *ZTokenizer)calcMultilineTokenPosOffset(data []byte) (lnoffset,lpoffset int) {
  201. cpi := 0
  202. dlen := len(data)
  203. lcnt := 0
  204. po := 0
  205. var reg *regexp.Regexp
  206. switch this.lbmode {
  207. case LINE_BREAK_MODE_NONE:
  208. return 0, dlen
  209. case LINE_BREAK_MODE_WIN:
  210. reg = this.reg_lbr_rn
  211. break
  212. case LINE_BREAK_MODE_MAC:
  213. reg = this.reg_lbr_r
  214. break
  215. case LINE_BREAK_MODE_UNIX:
  216. reg = this.reg_lbr_n
  217. break
  218. }
  219. for cpi < dlen {
  220. m := reg.FindIndex(data[cpi:])
  221. if len(m) == 2 {
  222. if m[1] > 0 {
  223. lcnt++
  224. cpi += m[1]
  225. po = cpi
  226. }else {
  227. break
  228. }
  229. }else {
  230. break
  231. }
  232. }
  233. return lcnt, dlen - po
  234. }