tokenizer.go 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. package zTokenizer
  2. import (
  3. "regexp"
  4. )
  5. type Enum_LineBreakMode uint8
  6. const (
  7. LINE_BREAK_MODE_AUTO Enum_LineBreakMode = iota
  8. LINE_BREAK_MODE_WIN
  9. LINE_BREAK_MODE_MAC
  10. LINE_BREAK_MODE_UNIX
  11. LINE_BREAK_MODE_NONE
  12. )
  13. type ZTokenizer struct {
  14. tokenRegList map[int]tokenRegInst
  15. sourceData []byte
  16. conf ZTokenizerConf
  17. currentPosition int
  18. totalLength int
  19. lineno int
  20. linepos int
  21. lbmode Enum_LineBreakMode
  22. reg_lbr_r *regexp.Regexp
  23. reg_lbr_n *regexp.Regexp
  24. reg_lbr_rn *regexp.Regexp
  25. lbrlist []int
  26. }
  27. type ZTokenizerConf struct {
  28. SingleLineTokenRegexp map[int]string
  29. MultiLineTokenRegexp map[int]string
  30. SingleLineDecorTokenRegexp map[int]string
  31. MultiLineDecorTokenRegexp map[int]string
  32. LineBreakMode Enum_LineBreakMode
  33. UseLineBreakAsToken bool
  34. LineBreakTokenID int
  35. LexicalErrorCallback func(char byte, lineno int, linepos int)
  36. }
  37. type tokenRegInst struct {
  38. reg *regexp.Regexp
  39. multiline bool
  40. decor bool
  41. }
  42. func NewZTokenizer(conf ZTokenizerConf) *ZTokenizer {
  43. zt := ZTokenizer{
  44. tokenRegList: make(map[int]tokenRegInst),
  45. conf: conf,
  46. }
  47. for k,v := range conf.SingleLineDecorTokenRegexp{
  48. zt.tokenRegList[k] = tokenRegInst{
  49. reg: regexp.MustCompile("^" + v),
  50. multiline: false,
  51. decor: true,
  52. }
  53. }
  54. for k,v := range conf.MultiLineDecorTokenRegexp{
  55. zt.tokenRegList[k] = tokenRegInst{
  56. reg: regexp.MustCompile("^" + v),
  57. multiline: true,
  58. decor: true,
  59. }
  60. }
  61. for k,v := range conf.SingleLineTokenRegexp{
  62. zt.tokenRegList[k] = tokenRegInst{
  63. reg: regexp.MustCompile("^" + v),
  64. multiline: false,
  65. decor: false,
  66. }
  67. }
  68. for k,v := range conf.MultiLineTokenRegexp{
  69. zt.tokenRegList[k] = tokenRegInst{
  70. reg: regexp.MustCompile("^" + v),
  71. multiline: true,
  72. decor: false,
  73. }
  74. }
  75. zt.reg_lbr_rn = regexp.MustCompile("\r\n")
  76. zt.reg_lbr_r = regexp.MustCompile("\r")
  77. zt.reg_lbr_n = regexp.MustCompile("\n")
  78. return &zt
  79. }
  80. func (this *ZTokenizer)Input(source []byte) {
  81. this.sourceData = source
  82. this.currentPosition = 0
  83. this.totalLength = len(source)
  84. this.lineno = 1
  85. this.linepos = 1
  86. if this.conf.LineBreakMode == LINE_BREAK_MODE_AUTO {
  87. this.lbmode = this.checkLineBreakMode()
  88. }else {
  89. this.lbmode = this.conf.LineBreakMode
  90. }
  91. }
  92. func (this *ZTokenizer)GetLex() (token int, value []byte, lineno int, linepos int) {
  93. charloop:
  94. for this.currentPosition < this.totalLength{
  95. flag := false
  96. switch this.lbmode {
  97. case LINE_BREAK_MODE_NONE:
  98. break
  99. case LINE_BREAK_MODE_UNIX:
  100. if this.sourceData[this.currentPosition] == '\n'{
  101. oln := this.lineno
  102. olp := this.linepos
  103. this.currentPosition++
  104. this.lineno++
  105. this.linepos = 1
  106. flag = true
  107. if this.conf.UseLineBreakAsToken {
  108. return this.conf.LineBreakTokenID, []byte("\n"), oln, olp
  109. }
  110. continue charloop
  111. }
  112. break
  113. case LINE_BREAK_MODE_MAC:
  114. if this.sourceData[this.currentPosition] == '\r'{
  115. oln := this.lineno
  116. olp := this.linepos
  117. this.currentPosition++
  118. this.lineno++
  119. this.linepos = 1
  120. flag = true
  121. if this.conf.UseLineBreakAsToken {
  122. return this.conf.LineBreakTokenID, []byte("\r"), oln, olp
  123. }
  124. continue charloop
  125. }
  126. break
  127. case LINE_BREAK_MODE_WIN:
  128. if this.sourceData[this.currentPosition] == '\r' && this.sourceData[this.currentPosition + 1] == '\n'{
  129. oln := this.lineno
  130. olp := this.linepos
  131. this.currentPosition += 2
  132. this.lineno++
  133. this.linepos = 1
  134. flag = true
  135. if this.conf.UseLineBreakAsToken {
  136. return this.conf.LineBreakTokenID, []byte("\r\n"), oln, olp
  137. }
  138. continue charloop
  139. }
  140. break
  141. }
  142. regloop:
  143. for tokentype, reginst := range this.tokenRegList {
  144. m := reginst.reg.FindIndex(this.sourceData[this.currentPosition:])
  145. if len(m) == 2 {
  146. if m[1] > 0 {
  147. st := this.currentPosition
  148. ed := this.currentPosition + m[1]
  149. oln := this.lineno
  150. olp := this.linepos
  151. this.currentPosition = ed
  152. if reginst.multiline {
  153. lno,lpo := this.calcMultilineTokenPosOffset(this.sourceData[st:ed])
  154. if lno > 0 {
  155. this.lineno += lno
  156. this.linepos = lpo + 1
  157. }else {
  158. this.linepos += lpo
  159. }
  160. }else {
  161. this.linepos += m[1]
  162. }
  163. if !reginst.decor {
  164. return tokentype, this.sourceData[st:ed], oln, olp
  165. }
  166. flag = true
  167. break regloop
  168. }
  169. }
  170. }
  171. if !flag{
  172. this.conf.LexicalErrorCallback(this.sourceData[this.currentPosition], this.lineno, this.linepos)
  173. return 0, []byte{},this.lineno, this.linepos
  174. }
  175. }
  176. return 0, []byte{},this.lineno, this.linepos
  177. }
  178. func (this *ZTokenizer)checkLineBreakMode() Enum_LineBreakMode {
  179. if this.reg_lbr_rn.Match(this.sourceData){
  180. return LINE_BREAK_MODE_WIN
  181. }
  182. if this.reg_lbr_r.Match(this.sourceData){
  183. return LINE_BREAK_MODE_MAC
  184. }
  185. if this.reg_lbr_n.Match(this.sourceData){
  186. return LINE_BREAK_MODE_UNIX
  187. }
  188. return LINE_BREAK_MODE_NONE
  189. }
  190. func (this *ZTokenizer)calcMultilineTokenPosOffset(data []byte) (lnoffset,lpoffset int) {
  191. cpi := 0
  192. dlen := len(data)
  193. lcnt := 0
  194. po := 0
  195. var reg *regexp.Regexp
  196. switch this.lbmode {
  197. case LINE_BREAK_MODE_NONE:
  198. return 0, dlen
  199. case LINE_BREAK_MODE_WIN:
  200. reg = this.reg_lbr_rn
  201. break
  202. case LINE_BREAK_MODE_MAC:
  203. reg = this.reg_lbr_r
  204. break
  205. case LINE_BREAK_MODE_UNIX:
  206. reg = this.reg_lbr_n
  207. break
  208. }
  209. for cpi < dlen {
  210. m := reg.FindIndex(data[cpi:])
  211. if len(m) == 2 {
  212. if m[1] > 0 {
  213. lcnt++
  214. cpi += m[1]
  215. po = cpi
  216. }else {
  217. break
  218. }
  219. }else {
  220. break
  221. }
  222. }
  223. return lcnt, dlen - po
  224. }