# jieba/analyse/__init__.py — TF-IDF based keyword extraction for jieba.
  1. import jieba
  2. import os
  3. try:
  4. from analyzer import ChineseAnalyzer
  5. except ImportError:
  6. pass
  7. _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
  8. f_name = os.path.join(_curpath,"idf.txt")
  9. content = open(f_name,'rb').read().decode('utf-8')
  10. idf_freq = {}
  11. lines = content.split('\n')
  12. for line in lines:
  13. word,freq = line.split(' ')
  14. idf_freq[word] = float(freq)
  15. median_idf = sorted(idf_freq.values())[len(idf_freq)/2]
  16. stop_words= set([
  17. "the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or","that"
  18. ])
  19. def extract_tags(sentence,topK=20):
  20. words = jieba.cut(sentence)
  21. freq = {}
  22. for w in words:
  23. if len(w.strip())<2: continue
  24. if w.lower() in stop_words: continue
  25. freq[w]=freq.get(w,0.0)+1.0
  26. total = sum(freq.values())
  27. freq = [(k,v/total) for k,v in freq.iteritems()]
  28. tf_idf_list = [(v * idf_freq.get(k,median_idf),k) for k,v in freq]
  29. st_list = sorted(tf_idf_list,reverse=True)
  30. top_tuples= st_list[:topK]
  31. tags = [a[1] for a in top_tuples]
  32. return tags