DictGenerator.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. # -*- coding: UTF-8 -*-
  2. import jieba.posseg as pseg
  3. import json
  4. print u"欢迎使用GensoukyoBurstedRouter验证码系统~"
  5. print u"本工具用于词库生成,请将用于词库生成的文本放入Source.txt(例如找一篇小说之类的)"
  6. print u"然后在custom.json内设置需要从结果中排除或额外增加的动词和名词"
  7. print u"(友情提示:所有文件请均使用UTF-8编码!)"
  8. print u"完成这些之后,请回车,否则请Ctrl+C终止执行。"
  9. print u"按回车继续..."
  10. raw_input()
  11. print u"======================================"
  12. print u"载入词库来源文本..."
  13. ff=open('Source.txt','rb')
  14. ftxt=ff.read().decode('utf-8')
  15. ff.close()
  16. print u"载入用户自定义配置..."
  17. ff=open('custom.json','rb')
  18. f=ff.read()
  19. cstm = json.loads(f,encoding='utf-8')
  20. ff.close()
  21. print u"开始分词..."
  22. nt,vt,=[],[]
  23. jb=pseg.cut(ftxt)
  24. for i in jb:
  25. if i.flag in ['n','nr','ns','nd','nh','ni','nl','ns','nz']:
  26. nt.append(i.word.encode('utf-8'))
  27. if i.flag == "v":
  28. vt.append(i.word.encode('utf-8'))
  29. print u"分词完毕!"
  30. print u"转换用户配置数据..."
  31. ntcadd = set(cstm['nouns']['add'])
  32. ntcdel = set(cstm['nouns']['del'])
  33. vtcadd = set(cstm['verbs']['add'])
  34. vtcdel = set(cstm['verbs']['del'])
  35. print u"按照用户配置增删词条、去除重复..."
  36. nts = set(nt)
  37. ntu = nts | ntcadd
  38. nto = ntu - ntcdel
  39. ntl = list(nto)
  40. vts = set(vt)
  41. vtu = vts | vtcadd
  42. vto = vtu - vtcdel
  43. vtl = list(vto)
  44. print u"处理完毕!正在写出文件..."
  45. od = {u"nouns":ntl,u"verbs":vtl}
  46. # for i in jb:
  47. # ot.append(i.encode('utf-8'))
  48. oj=json.dumps(od,ensure_ascii=False,encoding='utf8')
  49. ojhr = json.dumps(od,ensure_ascii=False,indent=4,encoding='utf8')
  50. ff=open('captcha-dict.json','wb')
  51. ff.write(oj.encode('utf-8'))
  52. ff.close()
  53. ff=open('captcha-dict-human-read.json','wb')
  54. ff.write(ojhr.encode('utf-8'))
  55. ff.close()
  56. print u"恭喜!词库已生成成功~"