__init__.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. from __future__ import with_statement
  2. import re
  3. import os
  4. import viterbi
  5. import jieba
  6. import sys
  7. import marshal
# Encoding that pair.__str__ uses to turn the unicode "word/flag" text into bytes.
default_encoding = sys.getfilesystemencoding()

# File names of the marshalled model tables.  load_model() joins each of them
# onto the directory it derives from this module's __file__.
PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"
CHAR_STATE_TAB_P = "char_state_tab.p"
  13. def load_model(f_name,isJython=True):
  14. _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
  15. result = {}
  16. with file(f_name, "rb") as f:
  17. for line in open(f_name,"rb"):
  18. line = line.strip()
  19. if line=="":continue
  20. word, _, tag = line.split(' ')
  21. result[word.decode('utf-8')]=tag
  22. f.closed
  23. if not isJython:
  24. return result
  25. start_p = {}
  26. abs_path = os.path.join(_curpath, PROB_START_P)
  27. with open(abs_path, mode='rb') as f:
  28. start_p = marshal.load(f)
  29. f.closed
  30. trans_p = {}
  31. abs_path = os.path.join(_curpath, PROB_TRANS_P)
  32. with open(abs_path, 'rb') as f:
  33. trans_p = marshal.load(f)
  34. f.closed
  35. emit_p = {}
  36. abs_path = os.path.join(_curpath, PROB_EMIT_P)
  37. with file(abs_path, 'rb') as f:
  38. emit_p = marshal.load(f)
  39. f.closed
  40. state = {}
  41. abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P)
  42. with file(abs_path, 'rb') as f:
  43. state = marshal.load(f)
  44. f.closed
  45. return state, start_p, trans_p, emit_p, result
# Build the module-level model tables once, at import time.
if sys.platform.startswith("java"):
    # "java" platforms (presumably Jython, matching load_model's isJython
    # flag): every table, including the marshalled ones, comes from load_model().
    char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model(jieba.get_abs_path_dict())
else:
    # Elsewhere the four tables are the ``P`` attribute of the imported table
    # modules, and load_model() is only asked for the word -> tag dictionary.
    import char_state_tab, prob_start, prob_trans, prob_emit
    char_state_tab_P, start_P, trans_P, emit_P = char_state_tab.P, prob_start.P, prob_trans.P, prob_emit.P
    word_tag_tab = load_model(jieba.get_abs_path_dict(),isJython=False)

# A non-empty jieba.user_word_tag_tab is merged on top of the dictionary tags,
# so its entries win on conflicts.
if jieba.user_word_tag_tab:
    word_tag_tab.update(jieba.user_word_tag_tab)
  54. class pair(object):
  55. def __init__(self,word,flag):
  56. self.word = word
  57. self.flag = flag
  58. def __unicode__(self):
  59. return self.word+u"/"+self.flag
  60. def __repr__(self):
  61. return self.__str__()
  62. def __str__(self):
  63. return self.__unicode__().encode(default_encoding)
  64. def encode(self,arg):
  65. return self.__unicode__().encode(arg)
  66. def __cut(sentence):
  67. prob, pos_list = viterbi.viterbi(sentence,char_state_tab_P, start_P, trans_P, emit_P)
  68. begin, next = 0,0
  69. for i,char in enumerate(sentence):
  70. pos = pos_list[i][0]
  71. if pos=='B':
  72. begin = i
  73. elif pos=='E':
  74. yield pair(sentence[begin:i+1], pos_list[i][1])
  75. next = i+1
  76. elif pos=='S':
  77. yield pair(char,pos_list[i][1])
  78. next = i+1
  79. if next<len(sentence):
  80. yield pair(sentence[next:], pos_list[next][1] )
  81. def __cut_detail(sentence):
  82. re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"([\.0-9]+|[a-zA-Z0-9]+)")
  83. re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
  84. blocks = re_han.split(sentence)
  85. for blk in blocks:
  86. if re_han.match(blk):
  87. for word in __cut(blk):
  88. yield word
  89. else:
  90. tmp = re_skip.split(blk)
  91. for x in tmp:
  92. if x!="":
  93. if re_num.match(x):
  94. yield pair(x,'m')
  95. elif re_eng.match(x):
  96. yield pair(x,'eng')
  97. else:
  98. yield pair(x,'x')
  99. def __cut_DAG(sentence):
  100. DAG = jieba.get_DAG(sentence)
  101. route ={}
  102. jieba.calc(sentence,DAG,0,route=route)
  103. x = 0
  104. buf =u''
  105. N = len(sentence)
  106. while x<N:
  107. y = route[x][1]+1
  108. l_word = sentence[x:y]
  109. if y-x==1:
  110. buf+= l_word
  111. else:
  112. if len(buf)>0:
  113. if len(buf)==1:
  114. yield pair(buf,word_tag_tab.get(buf,'x'))
  115. buf=u''
  116. else:
  117. if not (buf in jieba.FREQ):
  118. regognized = __cut_detail(buf)
  119. for t in regognized:
  120. yield t
  121. else:
  122. for elem in buf:
  123. yield pair(elem,word_tag_tab.get(elem,'x'))
  124. buf=u''
  125. yield pair(l_word,word_tag_tab.get(l_word,'x'))
  126. x =y
  127. if len(buf)>0:
  128. if len(buf)==1:
  129. yield pair(buf,word_tag_tab.get(buf,'x'))
  130. else:
  131. if not (buf in jieba.FREQ):
  132. regognized = __cut_detail(buf)
  133. for t in regognized:
  134. yield t
  135. else:
  136. for elem in buf:
  137. yield pair(elem,word_tag_tab.get(elem,'x'))
  138. def __cut_internal(sentence):
  139. if not ( type(sentence) is unicode):
  140. try:
  141. sentence = sentence.decode('utf-8')
  142. except:
  143. sentence = sentence.decode('gbk','ignore')
  144. re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)"), re.compile(ur"(\r\n|\s)")
  145. re_eng,re_num = re.compile(ur"[a-zA-Z0-9]+"), re.compile(ur"[\.0-9]+")
  146. blocks = re_han.split(sentence)
  147. for blk in blocks:
  148. if re_han.match(blk):
  149. for word in __cut_DAG(blk):
  150. yield word
  151. else:
  152. tmp = re_skip.split(blk)
  153. for x in tmp:
  154. if re_skip.match(x):
  155. yield pair(x,'x')
  156. else:
  157. for xx in x:
  158. if re_num.match(xx):
  159. yield pair(xx,'m')
  160. elif re_eng.match(x):
  161. yield pair(xx,'eng')
  162. else:
  163. yield pair(xx,'x')
  164. def __lcut_internal(sentence):
  165. return list(__cut_internal(sentence))
  166. def cut(sentence):
  167. if (not hasattr(jieba,'pool')) or (jieba.pool==None):
  168. for w in __cut_internal(sentence):
  169. yield w
  170. else:
  171. parts = re.compile('([\r\n]+)').split(sentence)
  172. result = jieba.pool.map(__lcut_internal,parts)
  173. for r in result:
  174. for w in r:
  175. yield w