|
From: <fri...@us...> - 2008-12-23 15:16:19
|
Revision: 9627
http://zaf.svn.sourceforge.net/zaf/?rev=9627&view=rev
Author: friedelwolff
Date: 2008-12-23 15:16:09 +0000 (Tue, 23 Dec 2008)
Log Message:
-----------
Some helper classes to work more easily with the hunspell .aff file format
Added Paths:
-----------
trunk/dict/zu/hunspell/hunspell_format.py
Added: trunk/dict/zu/hunspell/hunspell_format.py
===================================================================
--- trunk/dict/zu/hunspell/hunspell_format.py (rev 0)
+++ trunk/dict/zu/hunspell/hunspell_format.py 2008-12-23 15:16:09 UTC (rev 9627)
@@ -0,0 +1,223 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright 2008 Friedel Wolff
+#
+# This file is part of the Zulu Hunspell spell checker.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+"""This module contains helper classes for making a Hunspell checker."""
+
+#TODO: use unicode and handle encoding properly
+#TODO: more hunspell file level options
+#TODO: extended hunspell affix classes
+#TODO: allow to optionally remove the morphological information
+
class HunspellOptions(object):
    """A complete affix (.aff) file for Hunspell.

    Holds the file-level options and the affix groups, and renders all of
    it as text with hunspell().
    """

    header = ""
    """The header at the top of the file (like copyright notice)"""
    encoding = "utf-8"
    """The encoding to use to write out the file"""
    try_list = ""
    """A string of characters sorted by frequency to use for replacements"""
    groups = []
    """A list of all the affix groups"""
    replace_list = []
    """A list of suggestions for replacement"""
    circumfix = ""
    """The flag name to mark circumfixes with"""
    needaffix = ""
    """The flag name to indicate affix dependency requirement"""
    complexprefixes = False
    """Whether to enable complex prefix support"""

    def __init__(self):
        # Give every instance its own lists so nothing is shared through the
        # class-level defaults above.
        self.header = ""
        self.encoding = "utf-8"
        self.try_list = ""
        self.groups = []
        self.replace_list = []
        self.circumfix = ""
        self.needaffix = ""
        self.complexprefixes = False

    def hunspell(self):
        """Return the whole affix file for Hunspell as a single string.

        The output consists of the commented header, the SET line, the
        optional TRY / CIRCUMFIX / NEEDAFFIX / COMPLEXPREFIXES lines, every
        affix group, and finally the REP table (always written, even when
        it is empty).
        """
        # Comment out *every* line of the header; a multi-line header (such
        # as a copyright notice) must not leak uncommented text into the
        # affix file. An empty header still yields a single "# " line.
        output = ["# " + line for line in (self.header.splitlines() or [""])]
        output.append("SET %s" % self.encoding)
        if self.try_list:
            output.append("TRY " + self.try_list)
        if self.circumfix:
            output.append("CIRCUMFIX %s" % self.circumfix)
        if self.needaffix:
            output.append("NEEDAFFIX %s" % self.needaffix)
        if self.complexprefixes:
            output.append("COMPLEXPREFIXES")
        output.append("")

        # Each group renders itself; a blank line separates the groups.
        for group in self.groups:
            output.append(group.hunspell(self))
            output.append("")

        output.append("REP %d" % len(self.replace_list))
        for rep in self.replace_list:
            # Spaces in the replacement are written as underscores so that
            # the entry stays a single whitespace-delimited field.
            output.append("REP %s %s" % (rep[0], rep[1].replace(' ', '_')))
        return "\n".join(output)

    def add_group(self, **kwargs):
        """Create a new AffixGroup, add it to this file and return it.

        The keyword arguments are set as attributes on the new group;
        "flag" is required.
        """
        assert "flag" in kwargs, "an affix group needs a flag"
        new_group = AffixGroup()
        new_group.__dict__.update(kwargs)
        self.groups.append(new_group)
        return new_group
+
+
class AffixGroup(object):
    """An affix class (one PFX or SFX flag) containing multiple rules."""

    suffix = True
    """True if this is a suffix, False if this is a prefix"""
    flag = ""
    """The flag name"""
    cross_product = True
    """Whether to allow combining prefixes and suffixes"""
    rules = []
    """A list with all the affixes in this group"""

    def __init__(self):
        # Per-instance state; in particular each group gets its own list.
        self.suffix = True
        self.flag = ""
        self.cross_product = True
        self.rules = []

    def hunspell(self, options):
        """Return this affix class as text for the Hunspell affix file.

        The first line is the group header (type, flag, cross product and
        rule count); it is followed by one line per de-duplicated rule.
        `options` is passed through to every rule's hunspell() method.
        """
        option = "SFX" if self.suffix else "PFX"
        assert self.flag, "an affix group needs a flag"
        rules = self.remove_duplicates()
        cross_product = "Y" if self.cross_product else "N"
        output = ["%s %s %s %d" % (option, self.flag, cross_product, len(rules))]
        for rule in rules:
            output.append(rule.hunspell(options))
        return "\n".join(output)

    def remove_duplicates(self):
        """Return a sorted copy of the rules with equal rules collapsed.

        self.rules itself is left untouched. Within a run of equal rules
        the one that was added last is the one that is kept.
        """
        unique = []
        # sorted() is stable, so equal rules stay in insertion order and a
        # single pass over adjacent elements is enough.
        for rule in sorted(self.rules):
            if unique and rule == unique[-1]:
                # Duplicate: let the later rule replace the earlier one.
                unique[-1] = rule
            else:
                unique.append(rule)
        return unique

    def add_rule(self, **kwargs):
        """Create a new Affix rule, add it to this group and return it.

        The keyword arguments are set as attributes on the new rule.
        "group" may not be passed: it is always set to this group.
        """
        # Validate before touching the new rule.
        assert "group" not in kwargs, "the group of a rule is set automatically"
        new_rule = Affix()
        new_rule.__dict__.update(kwargs)
        new_rule.group = self
        self.rules.append(new_rule)
        return new_rule
+
class Affix(object):
    """A single affix rule with all its options.

    Rules are ordered and compared on the columns "affix", "strip",
    "condition" and "morphology" only; the other attributes do not take
    part in comparisons.
    """

    group = None
    """The AffixGroup that this affix belongs to"""
    strip = ""
    """The stripping characters from beginning (at prefix rules) or end (at suffix rules) of the word"""
    affix = ""
    """The appended / prepended characters"""
    condition = ""
    """The simplified regex to indicate when this affix applies"""
    continuation_classes = []
    """list of AffixGroups with which this affix can combine"""
    needaffix = False
    """Whether combination with another affix is needed"""
    circumfix = False
    """Whether to allow affixation as a circumfix"""
    morphology = ""
    """Extra information about morphology"""

    # The attributes that define ordering and equality, most significant first.
    _COMPARED_COLUMNS = ("affix", "strip", "condition", "morphology")

    def __init__(self):
        self.group = None
        self.strip = ""
        self.affix = ""
        self.condition = ""
        self.continuation_classes = []
        self.needaffix = False
        self.circumfix = False
        self.morphology = ""

    @staticmethod
    def _key(obj):
        """Return the tuple of compared columns of obj ("" where missing)."""
        return tuple([getattr(obj, column, "")
                      for column in Affix._COMPARED_COLUMNS])

    def __cmp__(self, other):
        """Three-way comparison on the compared columns (-1, 0 or 1).

        Kept for callers that invoke it directly; it no longer relies on
        the cmp() builtin.
        """
        mine, theirs = Affix._key(self), Affix._key(other)
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons: __cmp__ alone is not consulted by every Python
    # version, so sorting and duplicate detection must not depend on it.
    # As in __cmp__, the other operand is duck-typed via getattr().
    def __eq__(self, other):
        return Affix._key(self) == Affix._key(other)

    def __ne__(self, other):
        return Affix._key(self) != Affix._key(other)

    def __lt__(self, other):
        return Affix._key(self) < Affix._key(other)

    def __le__(self, other):
        return Affix._key(self) <= Affix._key(other)

    def __gt__(self, other):
        return Affix._key(self) > Affix._key(other)

    def __ge__(self, other):
        return Affix._key(self) >= Affix._key(other)

    # Rules are mutable, so hashing stays identity based; stating it
    # explicitly keeps instances hashable now that __eq__ is defined.
    __hash__ = object.__hash__

    def __str__(self):
        # A rule that has not been attached to a group yet has no flag.
        flag = None
        if self.group is not None:
            flag = self.group.flag
        return "(group=%s, strip=%s, affix=%s, condition=%s)" % \
                (flag, self.strip, self.affix, self.condition)

    def hunspell(self, options):
        """Return this rule as a single line for the Hunspell affix file.

        `options` supplies the file-level needaffix and circumfix flag
        names; it is only consulted when this rule sets needaffix or
        circumfix. The rule must belong to a group.
        """
        option = "SFX" if self.group.suffix else "PFX"
        flagname = self.group.flag
        # Empty strip/affix are written as "0", an empty condition as ".".
        strip = self.strip or "0"
        affix = self.affix or "0"
        condition = self.condition or "."
        continuation_classes = "".join([group.flag for group in self.continuation_classes])
        if self.needaffix:
            assert options.needaffix, "the file-level needaffix flag is not set"
            continuation_classes += options.needaffix
        if self.circumfix:
            assert options.circumfix, "the file-level circumfix flag is not set"
            continuation_classes += options.circumfix
        if continuation_classes:
            affix += "/" + continuation_classes
        line = "%s %s %s %s %s" % (option, flagname, strip, affix, condition)
        # Only append the morphology field when there is one, so that the
        # line does not end in a stray space.
        if self.morphology:
            line += " %s" % self.morphology
        return line
+
+
if __name__ == '__main__':
    # This is mainly meant as an example and a simple test: build a small
    # affix file with two groups and print it.
    aff = HunspellOptions()
    aff.header = "Comment at the top"
    aff.try_list = "abc"
    aff.replace_list = [["coow", "cow"], ["phuliezeman", "please man"]]

    group_a = aff.add_group(flag="A")
    group_a.add_rule(affix="s")
    group_a.add_rule(affix="ces", condition="ix", strip="x", morphology="funny_plural")
    # Deliberate duplicate of the first rule; it is dropped from the output.
    group_a.add_rule(affix="s")

    group_b = aff.add_group(flag="B")
    group_b.add_rule(continuation_classes=[group_a])

    # The parenthesised single-argument form works as both the print
    # statement and the print function.
    print(aff.hunspell())
+
Property changes on: trunk/dict/zu/hunspell/hunspell_format.py
___________________________________________________________________
Added: svn:executable
+ *
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|