|
From: <fri...@us...> - 2008-12-23 15:07:07
|
Revision: 9626
http://zaf.svn.sourceforge.net/zaf/?rev=9626&view=rev
Author: friedelwolff
Date: 2008-12-23 15:07:03 +0000 (Tue, 23 Dec 2008)
Log Message:
-----------
Fork myspell implementation to start working on hunspell specific things
Added Paths:
-----------
trunk/dict/zu/hunspell/
trunk/dict/zu/hunspell/zu_aff.py
Copied: trunk/dict/zu/hunspell/zu_aff.py (from rev 9624, trunk/dict/zu/myspell/zu_aff.py)
===================================================================
--- trunk/dict/zu/hunspell/zu_aff.py (rev 0)
+++ trunk/dict/zu/hunspell/zu_aff.py 2008-12-23 15:07:03 UTC (rev 9626)
@@ -0,0 +1,358 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright 2008 Friedel Wolff
+#
+# This file is part of Virtaal.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+"""This module creates an affix file for a Zulu spell checker."""
+
+import re
+
# Verbal concord prefixes.  The repeated strings presumably reflect noun
# classes that share a concord — confirm against a Zulu grammar reference.
subject_concords = ["ngi", "u", "si", "ni", "u", "ba", "i", "li", "a", "si", "zi", "lu", "bu", "ku"]
relative_prefixes = ["engi", "esi", "eni", "o", "aba", "e", "eli", "a", "esi", "ezi", "olu", "obu", "oku"]
situative_prefixes = ["e", "be"]
# Everything that can occupy the subject slot of a verb.
concords = subject_concords + relative_prefixes + situative_prefixes
object_concords = ["ngi", "ku", "si", "ni", "m", "ba", "wu", "yi", "li", "wa", "zi", "lu", "bu"]

# Each rule list starts with a header entry [flag, "Y", PFX/SFX] that
# output_myspell() rewrites into the affix-file header line; every entry
# appended later is a [strip, affix, condition] rule (see verb_rules).
a_rules = [["a", "Y", "PFX"]]
"""prefixes only applicable to verbs ending on -a"""
A_rules = [["A", "Y", "PFX"]]
"""prefixes for almost all positive verbs and future negatives"""
V_rules = [["V", "Y", "SFX"]]
"""common suffixes for verbs ending on -a"""
rules = [a_rules, A_rules, V_rules]
"""all rules"""
+
# Search/replace pairs used for palatalisation.  They are defined at module
# level so the patterns are compiled only once.  It is crucial that the
# entries starting with 'm' come first; otherwise the m-less rules would
# fire on them first.  Hash marks flag the more common substitutions.

palatalisation_list = [
    ["mbw", "njw"],#
    ["mpw", "ntshw"],
    ["mw", "nyw"], #
    ["bw", "tshw"],
    ["bhw", "jw"],
    ["phw", "shw"],#
]

# Pre-compiled [pattern, replacement] pairs, in the same order as above.
palatalisation_re_list = [
    [re.compile(search), replacement]
    for (search, replacement) in palatalisation_list
]
+
def palatalise(string):
    """Return *string* with every palatalisation substitution applied.

    The substitutions run in the order given by palatalisation_re_list,
    which puts the 'm'-initial clusters first on purpose.
    """
    result = string
    for pattern, replacement in palatalisation_re_list:
        result = pattern.sub(replacement, result)
    return result
+
# (search, suggestion) pairs emitted as REP lines in the affix file, to
# help the spell checker suggest common aspiration/voicing confusions.
# NOTE: the original list contained ["g", "k"] twice (before and after
# ["mg", "mng"]), which produced a duplicate REP entry; the duplicate has
# been removed.
replace_list = [
    ["bh", "b"],
    ["b", "bh"],
    ["ch", "c"],
    ["c", "ch"],
    ["dh", "d"],
    # dh doesn't occur
    ["hh", "h"],
    ["h", "hh"],
    ["kh", "k"],
    ["k", "kh"],
    ["k", "g"],
    ["g", "k"],
    ["mg", "mng"],
    ["ph", "p"],
    ["p", "ph"],
    ["qh", "q"],
    ["q", "qh"],
    ["th", "t"],
    ["t", "th"],
    ["xh", "x"],
    ["x", "xh"],
]
"""These are suggestions for replacement."""

# Palatalisation pairs are also useful as replacement suggestions.
replace_list.extend(palatalisation_list)
+
def illegal_reflexive(subject, object):
    """Return whether using the given concords together would result in an
    illegal reflexive.

    Although the object concord will always change to 'zi' if reflexivity
    is intended, most combinations are actually valid for non-reflexive
    usage.  Example:
        Umama uyamsiza (ubaba).
    Both subject and object are class 1a nouns, but since they refer to
    different entities, the word is valid.  Reflexives are not handled
    explicitly anywhere, since they are indistinguishable from the case
    where the object is in the 'zi' class.  There are therefore only two
    truly illegal cases: 'ngi' and 'ni' combined with themselves.
    """
    # NOTE: the second parameter shadows the builtin `object`; the name is
    # kept for interface compatibility with existing callers.
    return subject == object and subject in ("ngi", "ni")
+
+
def add_semivowels(prefix):
    """Insert a semi-vowel (y or w) between clashing vowels in *prefix*."""
    #TODO: maybe iya?
    result = prefix
    # The substitutions must run in this order (a, then i, then u), exactly
    # as the original sequential re.sub calls did.
    for trailing, replacement in ((r"([aeiou])a", r"\1wa"),
                                  (r"([aeiou])i", r"\1yi"),
                                  (r"([aeiou])u", r"\1wu")):
        result = re.sub(trailing, replacement, result)
    return result
+
def contract(prefix):
    """Contract each pair of adjacent vowels into the second vowel alone."""
    return re.sub(r"[aeiou]([aeiou])", r"\1", prefix)
+
def verb_rules(prefix):
    """Generate the necessary rules to prepend the given prefix to a verb

    It receives a string with the already built (complete) prefix and
    returns a list of lists, with each list representing one affix rule as
    a [strip, affix, condition] triple (the format output_myspell emits
    inside a PFX block).  The consequences of vowel verbs are taken into
    account here, so no caller of this function needs to handle vowel
    verbs itself.
    """
    changed = []
    #normal verb starting on consonant:
    changed.append(["0", prefix, "[^y]"])
    #monosyllabic verbs: in the dictionary in imperative form, e.g. yidla
    # (strip the imperative "yi" before attaching the prefix)
    changed.append(["yi", prefix, "yi"])

    #now for the complicated part: verbs starting on vowels. (e.g. yakha)
    # Vowel verbs are stored with a leading "y" (yakha, yenza), so each
    # branch below strips that "y" and adjusts the prefix's final vowel.
    if prefix[-1] == 'u':
        #the 'u' always needs to be removed, we probably need a 'w'
        prefix = prefix[0:-1]
        #if the original prefix ended on 'wu', we don't want to add
        #another 'w' as this will result in 'ww'
        if len(prefix) > 0 and prefix[-1] == 'w':
            changed.append(["y", prefix, "y[ae]"])
        else:
            changed.append(["y", prefix + 'w', "y[ae]"])
        if len(prefix) == 0:
            #if prefix == 'u' before 'o' we change to 'w' as above
            changed.append(["y", 'w', "yo"])
        else:
            #for a prefix ending on 'u' before 'o' we simply remove the 'u'
            #without adding 'w', e.g. lu + osa -> losa
            changed.append(["y", prefix, "yo"])
        return changed
    if prefix[-1] == 'i':
        #if the complete prefix is 'i' before a vowel verb, we can
        #ignore it, as it is simply the same as the imperative form
        #example: i + enza = yenza
        #
        #otherwise, change 'i' to 'y'
        if len(prefix) > 1:
            prefix = prefix[0:-1]
            changed.append(["y", prefix, "y[^i]"])
            #changed.append(["y", prefix, "y"])
            #TODO: this just made the "yi", prefix "yi" rule (above) unnecessary
        return changed
    if prefix[-1] == 'a':
        # final 'a' coalesces with the verb's initial vowel, so drop it
        prefix = prefix[0:-1]
        changed.append(["y", prefix, "y[^i]"])
        #changed.append(["y", prefix, "y"])
        #TODO: this just made the "yi", prefix "yi" rule (above) unnecessary
        return changed
    if prefix == "o":
        #Although other prefixes can end on 'o' ('zo' or 'yo'), these
        #are only used with consonant verbs and not with monosyllabics.
        #TODO: verify if others are possible
        #TODO: verify if 'zo' and 'yo' are only used before consonants

        return changed
    if prefix[-1] == "e":
        #TODO: situative e- and be- should be handled (entirely thrown away?).
        #We can probably just generate them, and duplicate handling should take
        #care of them.
        pass
    return changed
+
def quicksort(l):
    """Return a copy of *l* sorted by each rule's affix text (element 1).

    Kept under its historical name for compatibility.  The hand-rolled
    quicksort (O(n^2) worst case, and quadratic list concatenation) has
    been replaced by the built-in stable sort.  The original partitioning
    (first element as pivot, equal keys kept to the right in order) was
    stable too, so the output order is identical.
    """
    return sorted(l, key=lambda rule: rule[1])
+
def remove_duplicates(rules):
    """Remove duplicate rules and return the trimmed rules list.

    The first element is the header line and stays in place; the remaining
    rules are sorted stably by affix text (the same order quicksort()
    produces) so duplicates become adjacent, then collapsed in one pass.
    As in the original, a rule identical to the header is dropped too.

    The original implementation called list.remove() while scanning, which
    is O(n^2); this single-pass rebuild yields the identical result.
    """
    deduped = [rules[0]]
    previous = rules[0]
    for rule in sorted(rules[1:], key=lambda r: r[1]):
        if rule != previous:
            deduped.append(rule)
        previous = rule
    return deduped
+
+def output_myspell():
+ """Output the generated rules in the format required for a myspell affix
+ file."""
+ print """# Automatically generated by zu_aff.py"
+SET ISO8859-1
+TRY aeinulkhosbgywmztdpfcqrvj-ASJMHxEKBGNPTRLDIZFOUWVYC
+
+"""
+ print "REP %d" % len(replace_list)
+ for rep in replace_list:
+ print "REP %s %s" % (rep[0], rep[1].replace(' ', '_'))
+ print
+
+ for rule_set in rules:
+ identifier = rule_set[0][0]
+ rule_set[0][0] = ''
+ affix_type = rule_set[0][2]
+ rule_set[0][2] = str(len(rule_set)-1)
+ #remember that the first element does not count
+ for rule in rule_set:
+ print affix_type + " " + identifier + ' ' + rule[0],
+ if len(rule[1]) > 0:
+ print rule[1],
+ else:
+ print "0",
+ print rule[2]
+ print
+
+################################################################################
+
# Build the prefix rules: for every concord, generate rules for the plain,
# -nga- and -sa- forms (A_rules: all positive verbs), the future tenses
# (a_rules: only verbs ending on -a), and the same again combined with each
# legal object concord.
for i in concords:
    A_rules.extend(verb_rules(i))

    A_rules.extend(verb_rules(i+"nga"))
    A_rules.extend(verb_rules(i+"sa"))

    #Future tenses:
    a_rules.extend(verb_rules(i+"zo"))
    a_rules.extend(verb_rules(i+"zoku"))
    a_rules.extend(verb_rules(i+"yo"))
    a_rules.extend(verb_rules(i+"yoku"))

    #-sa- + future tenses:
    a_rules.extend(verb_rules(i+"sazo"))
    a_rules.extend(verb_rules(i+"sazoku"))
    a_rules.extend(verb_rules(i+"sayo"))
    a_rules.extend(verb_rules(i+"sayoku"))

    for j in object_concords:
        # skip the two combinations that can only be bad reflexives
        if illegal_reflexive(i, j):
            continue
        A_rules.extend(verb_rules(i+j))
        A_rules.extend(verb_rules(i+"nga"+j)) #confusable with negatives
        A_rules.extend(verb_rules(i+"sa"+j))

        #Future tenses:
        a_rules.extend(verb_rules(i+"zo"+j))
        a_rules.extend(verb_rules(i+"zoku"+j))
        a_rules.extend(verb_rules(i+"yo"+j))
        a_rules.extend(verb_rules(i+"yoku"+j))

        #-sa- + future tenses:
        a_rules.extend(verb_rules(i+"sazo"+j))
        a_rules.extend(verb_rules(i+"sazoku"+j))
        a_rules.extend(verb_rules(i+"sayo"+j))
        a_rules.extend(verb_rules(i+"sayoku"+j))
+
#Mode specific ones: forms that only combine with plain subject concords.
for i in subject_concords:
    #Indicative:
    a_rules.extend(verb_rules(i+"ya"))
    #TODO: be- and se- forms

    #Negative future tenses:
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"zu"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"zuku"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"yu"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"yuku"))

    #-ka- + negative future tenses:
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kazu"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kazuku"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kayu"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kayuku"))

    #Remote past tense:
    a_rules.extend(verb_rules(contract(i+"a")))

    for j in object_concords:
        if illegal_reflexive(i, j):
            continue
        #Indicative:
        a_rules.extend(verb_rules(i + "ya" + j))
        #Infinitive.  The original accidentally generated this rule four
        #times; once is enough — the copies were only trimmed again later
        #by remove_duplicates(), so the final output is unchanged.  (Note
        #that "uku" + j does not depend on the subject concord i at all.)
        a_rules.extend(verb_rules("uku" + j))

        #Negative future tenses:
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"zu"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"zuku"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"yu"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"yuku"+j))

        #-ka- + negative future tenses:
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kazu"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kazuku"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kayu"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kayuku"+j))

        #TODO: be- and se- forms

        #Remote past tense:
        a_rules.extend(verb_rules(contract(i+"a")+j))
+
+
# A few fixed prefixes that are not built from concords.
a_rules.extend(verb_rules("loku"))
a_rules.extend(verb_rules("ngoku"))
a_rules.extend(verb_rules("noku"))

# Verbal extension suffixes for verbs ending on -a, as [strip, add,
# condition] triples (applicative -el-, neuter -ek-, causative -is-, etc.).
#Lines below indicated with hashes will cause incorrect imperatives
V_rules.append(['a', 'ela', 'a'])
V_rules.append(['a', 'elani', 'a'])
V_rules.append(['a', 'elaphi', 'a']) #
V_rules.append(['a', 'eka', 'a'])
V_rules.append(['a', 'ekana', 'a'])
V_rules.append(['a', 'ekani', 'a'])
V_rules.append(['a', 'ekaphi', 'a']) #
V_rules.append(['a', 'isa', 'a'])
V_rules.append(['a', 'isana', 'a'])
V_rules.append(['a', 'isani', 'a'])
V_rules.append(['a', 'isaphi', 'a']) #
V_rules.append(['0', 'na', 'a'])
V_rules.append(['0', 'ni', 'a'])
V_rules.append(['0', 'phi', 'a']) #
V_rules.append(['a', 'wa', '[^w]a'])
#The above can create problems with monosyllabic verbs
V_rules.append(['a', 'wani', '[^w]a'])
V_rules.append(['a', 'waphi', '[^w]a'])
#The above can also be used with negatives, can't they? As in: Igama alipelwa.

# Trim each rule set (generation above produces many duplicates), then
# emit the finished affix file on stdout.
for i in range(len(rules)):
    rules[i] = remove_duplicates(rules[i])

output_myspell()
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|