From: F W. <fri...@us...> - 2006-06-30 10:48:46
|
Update of /cvsroot/translate/src/translate/search
In directory sc8-pr-cvs10.sourceforge.net:/tmp/cvs-serv2516/search

Modified Files:
    match.py
Log Message:
Filter out usable units, convert to fast format, factor out (re)initialisation code

Index: match.py
===================================================================
RCS file: /cvsroot/translate/src/translate/search/match.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- match.py 10 Feb 2006 20:18:30 -0000 1.2
+++ match.py 30 Jun 2006 10:48:28 -0000 1.3
@@ -19,33 +19,56 @@
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 #
 
-"""Class to return likely matches from a given list of strings"""
+"""Class to perform translation memory matching from a store of translation units"""
 
 import Levenshtein
 import heapq
+from translate.storage import base
+
+def usable(unit):
+    """Returns whether this translation unit is usable for TM"""
+    #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
+    if unit.source and unit.target and not unit.isfuzzy():
+        return True
+    return False
 
 class matcher:
     """A class that will do matching and store configuration for the matching process"""
-    def __init__(self, max_candidates=15, min_similarity=75, comparer=None):
+    def __init__(self, store, max_candidates=10, min_similarity=75, comparer=None):
         """max_candidates is the maximum number of candidates that should be assembled,
         min_similarity is the minimum similarity that must be attained to be included in
         the result, comparer is an optional Comparer with similarity() function"""
         if comparer is None:
             comparer = Levenshtein.LevenshteinComparer()
         self.comparer = comparer
+        self.setparameters(max_candidates, min_similarity)
+        self.inittm(store)
+
+    def inittm(self, store):
+        """Initialises the memory for later use. We use simple base units for
+        speedup."""
+        self.candidates = []
+        candidates = filter(usable, store.units)
+        for candidate in candidates:
+            simpleunit = base.TranslationUnit(candidate.source)
+            simpleunit.target = candidate.target
+            self.candidates.append(simpleunit)
+
+    def setparameters(self, max_candidates=10, min_similarity=75):
+        """Sets the parameters without reinitialising the tm. If a parameter
+        is not specified, it is set to the default, not ignored"""
         self.MAX_CANDIDATES = max_candidates
         self.MIN_SIMILARITY = min_similarity
-
-    def matches(self, text, candidates):
+
+    def matches(self, text):
         """Returns a list of possible matches for text in candidates with the associated similarity.
-        candidates is a list of base.TranslationUnits
         Return value is a list containing tuples (score, original, translation)."""
         bestcandidates = [(0.0,"","")]*self.MAX_CANDIDATES
         heapq.heapify(bestcandidates)
         #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
         #that are better, we can adjust min_similarity upwards for speedup
         min_similarity = self.MIN_SIMILARITY
-        for candidate in candidates:
+        for candidate in self.candidates:
             cmpstring = candidate.source
             targetstring = candidate.target
             similarity = self.comparer.similarity(text, cmpstring, min_similarity)
|