from __future__ import with_statement
__version__ = '0.31'
__license__ = 'MIT'

import re
import os
import sys
import finalseg
import time
import tempfile
import marshal
from math import log
import random
import threading
from functools import wraps

DICTIONARY = "dict.txt"
DICT_LOCK = threading.RLock()
trie = None  # to be initialized
FREQ = {}
min_freq = 0.0
total = 0.0
user_word_tag_tab = {}
initialized = False
def gen_trie(f_name):
    """Build a character trie and a word-frequency table from a dictionary
    file whose lines look like "word freq tag" (space separated)."""
    lfreq = {}
    trie = {}
    ltotal = 0.0
    with open(f_name, 'rb') as f:
        lineno = 0
        for line in f.read().rstrip().decode('utf-8').split('\n'):
            lineno += 1
            try:
                word, freq, _ = line.split(' ')
                freq = float(freq)
                lfreq[word] = freq
                ltotal += freq
                # insert the word into the trie, one character per level
                p = trie
                for c in word:
                    if c not in p:
                        p[c] = {}
                    p = p[c]
                p[''] = ''  # ending flag: a complete word terminates here
            except ValueError, e:
                print >> sys.stderr, '%s: invalid entry at line %d: %s' % (f_name, lineno, line)
                raise e
    return trie, lfreq, ltotal
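
# Illustrative sketch (not executed): for a toy dictionary file with the two
# entries "AB 3 n" and "A 2 n", gen_trie would return a trie shaped like
#     {'A': {'': '', 'B': {'': ''}}}
# where each '' key marks the end of a complete dictionary word.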
def initialize(*args):
    global trie, FREQ, total, min_freq, initialized
    if len(args) == 0:
        dictionary = DICTIONARY
    else:
        dictionary = args[0]
    with DICT_LOCK:
        if initialized:
            return
        if trie:
            del trie
            trie = None
        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
        abs_path = os.path.join(_curpath, dictionary)
        print >> sys.stderr, "Building Trie..., from " + abs_path
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"):  # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:  # custom dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user." + str(hash(abs_path)) + ".cache")
        # reuse the marshalled cache only if it is newer than the dictionary
        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
            print >> sys.stderr, "loading model from cache " + cache_file
            try:
                with open(cache_file, 'rb') as cf:
                    trie, FREQ, total, min_freq = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                load_from_cache_fail = True
        if load_from_cache_fail:
            trie, FREQ, total = gen_trie(abs_path)
            FREQ = dict([(k, log(float(v) / total)) for k, v in FREQ.iteritems()])  # normalize to log probabilities
            min_freq = min(FREQ.itervalues())
            print >> sys.stderr, "dumping model to file cache " + cache_file
            try:
                # write to a temp file first, then rename, so a concurrent
                # reader never sees a half-written cache
                tmp_suffix = "." + str(random.random())
                with open(cache_file + tmp_suffix, 'wb') as temp_cache_file:
                    marshal.dump((trie, FREQ, total, min_freq), temp_cache_file)
                if os.name == 'nt':
                    import shutil
                    replace_file = shutil.move  # os.rename cannot overwrite on Windows
                else:
                    replace_file = os.rename
                replace_file(cache_file + tmp_suffix, cache_file)
            except Exception:
                print >> sys.stderr, "dump cache file failed."
                import traceback
                print >> sys.stderr, traceback.format_exc()
        initialized = True
        print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
        print >> sys.stderr, "Trie has been built successfully."
def require_initialized(fn):
    """Decorator: lazily build the model before the first real call."""
    global initialized, DICTIONARY

    @wraps(fn)
    def wrapped(*args, **kwargs):
        if initialized:
            return fn(*args, **kwargs)
        else:
            initialize(DICTIONARY)
            return fn(*args, **kwargs)
    return wrapped
def __cut_all(sentence):
    # "full mode": enumerate every candidate word in the DAG
    dag = get_DAG(sentence)
    old_j = -1
    for k in xrange(len(sentence)):  # walk positions in order; dict iteration order is not guaranteed
        L = dag[k]
        if len(L) == 1 and k > old_j:
            yield sentence[k:L[0] + 1]
            old_j = L[0]
        else:
            for j in L:
                if j > k:
                    yield sentence[k:j + 1]
                    old_j = j
def calc(sentence, DAG, idx, route):
    # Dynamic programming over the DAG, right to left: route[i] holds the
    # best (log probability, end index) for the suffix starting at i.
    # The idx parameter is ignored; the loop rebinds it for every position.
    N = len(sentence)
    route[N] = (0.0, '')
    for idx in xrange(N - 1, -1, -1):
        candidates = [(FREQ.get(sentence[idx:x + 1], min_freq) + route[x + 1][0], x) for x in DAG[idx]]
        route[idx] = max(candidates)
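
# Worked sketch (hypothetical log probabilities): for sentence u"ABC" with
# DAG = {0: [0, 1], 1: [1], 2: [2]}, the pass fills route[3] = (0.0, ''),
# then route[2], route[1], route[0]; if FREQ["AB"] > FREQ["A"] + FREQ["B"]
# (all on the log scale), route[0] picks end index 1, so the best
# segmentation is "AB" / "C".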
@require_initialized
def get_DAG(sentence):
    # Build the word DAG: DAG[i] is the list of end indices j such that
    # sentence[i:j+1] is a word in the trie.
    N = len(sentence)
    i, j = 0, 0
    p = trie
    DAG = {}
    while i < N:
        c = sentence[j]
        if c in p:
            p = p[c]
            if '' in p:  # sentence[i:j+1] is a complete word
                if i not in DAG:
                    DAG[i] = []
                DAG[i].append(j)
            j += 1
            if j >= N:
                i += 1
                j = i
                p = trie
        else:
            p = trie
            i += 1
            j = i
    # every position can at least stand alone as a single character
    for i in xrange(len(sentence)):
        if i not in DAG:
            DAG[i] = [i]
    return DAG
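
# Illustrative sketch: with a dictionary containing only "A" and "AB",
# get_DAG(u"ABC") yields {0: [0, 1], 1: [1], 2: [2]} -- both "A" and "AB"
# start at position 0, while positions 1 and 2 fall back to single
# characters.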
def __cut_DAG(sentence):
    DAG = get_DAG(sentence)
    route = {}
    calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            # collect consecutive single characters; together they may form
            # an unknown word to be resolved by the HMM below
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield buf
                    buf = u''
                else:
                    if not (buf in FREQ):
                        # out-of-vocabulary run: let the HMM-based finalseg split it
                        recognized = finalseg.cut(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield elem
                    buf = u''
            yield l_word
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield buf
        else:
            if not (buf in FREQ):
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem
def cut(sentence, cut_all=False):
    if not isinstance(sentence, unicode):
        try:
            sentence = sentence.decode('utf-8')
        except UnicodeDecodeError:
            sentence = sentence.decode('gbk', 'ignore')
    # re_han matches runs that should be segmented; re_skip matches
    # whitespace/newlines that are passed through as-is
    re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", re.U), re.compile(ur"(\r\n|\s)", re.U)
    if cut_all:
        re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5]+)", re.U), re.compile(ur"[^a-zA-Z0-9+#\n]", re.U)
    blocks = re_han.split(sentence)
    cut_block = __cut_DAG
    if cut_all:
        cut_block = __cut_all
    for blk in blocks:
        if re_han.match(blk):
            for word in cut_block(blk):
                yield word
        else:
            tmp = re_skip.split(blk)
            for x in tmp:
                if re_skip.match(x):
                    yield x
                elif not cut_all:
                    for xx in x:
                        yield xx
                else:
                    yield x
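
# Usage sketch (the exact split depends on the dictionary that is loaded):
#     >>> import jieba
#     >>> '/'.join(jieba.cut(u"我来到北京清华大学"))
#     u'我/来到/北京/清华大学'
#     >>> '/'.join(jieba.cut(u"我来到北京清华大学", cut_all=True))  # full mode: all candidates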
def cut_for_search(sentence):
    # Finer-grained segmentation for search engines: after the normal cut,
    # also emit in-vocabulary 2-grams and 3-grams found inside longer words.
    words = cut(sentence)
    for w in words:
        if len(w) > 2:
            for i in xrange(len(w) - 1):
                gram2 = w[i:i + 2]
                if gram2 in FREQ:
                    yield gram2
        if len(w) > 3:
            for i in xrange(len(w) - 2):
                gram3 = w[i:i + 3]
                if gram3 in FREQ:
                    yield gram3
        yield w
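
# Illustrative note: a four-character word such as u"清华大学" is emitted
# together with any of its substrings (e.g. u"清华", u"大学") that appear in
# FREQ, which helps a search index match shorter queries.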
@require_initialized
def load_userdict(f):
    """Load a user dictionary; f is a file path or an open file object.
    Each line has the form "word freq [tag]", space separated."""
    global trie, total, FREQ
    if isinstance(f, (str, unicode)):
        f = open(f, 'rb')
    content = f.read().decode('utf-8')
    line_no = 0
    for line in content.split("\n"):
        line_no += 1
        if line.rstrip() == '':
            continue
        tup = line.split(" ")
        word, freq = tup[0], tup[1]
        if line_no == 1:
            word = word.replace(u'\ufeff', u"")  # remove BOM if present
        if len(tup) == 3:
            add_word(word, freq, tup[2])
        else:
            add_word(word, freq)
def add_word(word, freq, tag=None):
    global FREQ, trie, total, user_word_tag_tab
    freq = float(freq)
    FREQ[word] = log(freq / total)  # store as a log probability, like the main dict
    if tag is not None:
        user_word_tag_tab[word] = tag.strip()
    # insert the new word into the trie as well
    p = trie
    for c in word:
        if c not in p:
            p[c] = {}
        p = p[c]
    p[''] = ''  # ending flag
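
# Usage sketch, assuming a hypothetical file userdict.txt with lines like
#     云计算 5 n
#     创新办 3 i
#     >>> jieba.load_userdict('userdict.txt')
#     >>> jieba.add_word(u'台中', 10)   # or add single entries directly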
# keep references to the sequential implementations so that
# enable_parallel/disable_parallel can swap them in and out
__ref_cut = cut
__ref_cut_for_search = cut_for_search

def __lcut(sentence):
    return list(__ref_cut(sentence, False))

def __lcut_all(sentence):
    return list(__ref_cut(sentence, True))

def __lcut_for_search(sentence):
    return list(__ref_cut_for_search(sentence))
@require_initialized
def enable_parallel(processnum=None):
    global pool, cut, cut_for_search
    if os.name == 'nt':
        raise Exception("jieba: parallel mode only supports POSIX systems")
    if sys.version_info[0] == 2 and sys.version_info[1] < 6:
        raise Exception("jieba: the parallel feature needs Python version > 2.5")
    from multiprocessing import Pool, cpu_count
    if processnum is None:
        processnum = cpu_count()
    pool = Pool(processnum)

    def pcut(sentence, cut_all=False):
        # split on line breaks and segment the parts in parallel
        parts = re.compile('([\r\n]+)').split(sentence)
        if cut_all:
            result = pool.map(__lcut_all, parts)
        else:
            result = pool.map(__lcut, parts)
        for r in result:
            for w in r:
                yield w

    def pcut_for_search(sentence):
        parts = re.compile('([\r\n]+)').split(sentence)
        result = pool.map(__lcut_for_search, parts)
        for r in result:
            for w in r:
                yield w

    # swap the module-level entry points for the parallel versions
    cut = pcut
    cut_for_search = pcut_for_search
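
# Usage sketch (POSIX only):
#     >>> jieba.enable_parallel(4)   # segment across 4 worker processes
#     >>> words = list(jieba.cut(long_text))
#     >>> jieba.disable_parallel()   # restore the sequential implementation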
def disable_parallel():
    global pool, cut, cut_for_search
    if 'pool' in globals():
        pool.close()
        pool = None
    cut = __ref_cut
    cut_for_search = __ref_cut_for_search
def set_dictionary(dictionary_path):
    global initialized, DICTIONARY
    with DICT_LOCK:
        abs_path = os.path.normpath(os.path.join(os.getcwd(), dictionary_path))
        if not os.path.exists(abs_path):
            raise Exception("jieba: path does not exist: " + abs_path)
        DICTIONARY = abs_path
        initialized = False  # force a reload on the next call
def get_abs_path_dict():
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    abs_path = os.path.join(_curpath, DICTIONARY)
    return abs_path
def tokenize(unicode_sentence, mode="default"):
    """Yield (word, start, end) tuples; mode is "default" or "search"."""
    if not isinstance(unicode_sentence, unicode):
        raise Exception("jieba: the input parameter should be unicode.")
    start = 0
    if mode == 'default':
        for w in cut(unicode_sentence):
            width = len(w)
            yield (w, start, start + width)
            start += width
    else:
        # search mode: also report in-vocabulary 2-grams and 3-grams inside
        # longer words, as cut_for_search does
        for w in cut(unicode_sentence):
            width = len(w)
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if gram2 in FREQ:
                        yield (gram2, start + i, start + i + 2)
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if gram3 in FREQ:
                        yield (gram3, start + i, start + i + 3)
            yield (w, start, start + width)
            start += width
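
# Usage sketch:
#     >>> for tk in jieba.tokenize(u"永和服装饰品有限公司"):
#     ...     print tk   # (word, start offset, end offset)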