__init__.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. from __future__ import with_statement
  2. import re
  3. import os
  4. import marshal
  5. import sys
  6. MIN_FLOAT=-3.14e100
  7. PROB_START_P = "prob_start.p"
  8. PROB_TRANS_P = "prob_trans.p"
  9. PROB_EMIT_P = "prob_emit.p"
  10. PrevStatus = {
  11. 'B':('E','S'),
  12. 'M':('M','B'),
  13. 'S':('S','E'),
  14. 'E':('B','M')
  15. }
  16. def load_model():
  17. _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
  18. start_p = {}
  19. abs_path = os.path.join(_curpath, PROB_START_P)
  20. with open(abs_path, mode='rb') as f:
  21. start_p = marshal.load(f)
  22. f.closed
  23. trans_p = {}
  24. abs_path = os.path.join(_curpath, PROB_TRANS_P)
  25. with open(abs_path, 'rb') as f:
  26. trans_p = marshal.load(f)
  27. f.closed
  28. emit_p = {}
  29. abs_path = os.path.join(_curpath, PROB_EMIT_P)
  30. with file(abs_path, 'rb') as f:
  31. emit_p = marshal.load(f)
  32. f.closed
  33. return start_p, trans_p, emit_p
  34. if sys.platform.startswith("java"):
  35. start_P, trans_P, emit_P = load_model()
  36. else:
  37. import prob_start,prob_trans,prob_emit
  38. start_P, trans_P, emit_P = prob_start.P, prob_trans.P, prob_emit.P
  39. def viterbi(obs, states, start_p, trans_p, emit_p):
  40. V = [{}] #tabular
  41. path = {}
  42. for y in states: #init
  43. V[0][y] = start_p[y] + emit_p[y].get(obs[0],MIN_FLOAT)
  44. path[y] = [y]
  45. for t in range(1,len(obs)):
  46. V.append({})
  47. newpath = {}
  48. for y in states:
  49. em_p = emit_p[y].get(obs[t],MIN_FLOAT)
  50. (prob,state ) = max([(V[t-1][y0] + trans_p[y0].get(y,MIN_FLOAT) + em_p ,y0) for y0 in PrevStatus[y] ])
  51. V[t][y] =prob
  52. newpath[y] = path[state] + [y]
  53. path = newpath
  54. (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
  55. return (prob, path[state])
  56. def __cut(sentence):
  57. global emit_P
  58. prob, pos_list = viterbi(sentence,('B','M','E','S'), start_P, trans_P, emit_P)
  59. begin, next = 0,0
  60. #print pos_list, sentence
  61. for i,char in enumerate(sentence):
  62. pos = pos_list[i]
  63. if pos=='B':
  64. begin = i
  65. elif pos=='E':
  66. yield sentence[begin:i+1]
  67. next = i+1
  68. elif pos=='S':
  69. yield char
  70. next = i+1
  71. if next<len(sentence):
  72. yield sentence[next:]
  73. def cut(sentence):
  74. if not ( type(sentence) is unicode):
  75. try:
  76. sentence = sentence.decode('utf-8')
  77. except:
  78. sentence = sentence.decode('gbk','ignore')
  79. re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)"), re.compile(ur"(\d+\.\d+|[a-zA-Z0-9]+)")
  80. blocks = re_han.split(sentence)
  81. for blk in blocks:
  82. if re_han.match(blk):
  83. for word in __cut(blk):
  84. yield word
  85. else:
  86. tmp = re_skip.split(blk)
  87. for x in tmp:
  88. if x!="":
  89. yield x