|
From: <fri...@us...> - 2008-12-23 15:07:07
|
Revision: 9626
http://zaf.svn.sourceforge.net/zaf/?rev=9626&view=rev
Author: friedelwolff
Date: 2008-12-23 15:07:03 +0000 (Tue, 23 Dec 2008)
Log Message:
-----------
Fork myspell implementation to start working on hunspell specific things
Added Paths:
-----------
trunk/dict/zu/hunspell/
trunk/dict/zu/hunspell/zu_aff.py
Copied: trunk/dict/zu/hunspell/zu_aff.py (from rev 9624, trunk/dict/zu/myspell/zu_aff.py)
===================================================================
--- trunk/dict/zu/hunspell/zu_aff.py (rev 0)
+++ trunk/dict/zu/hunspell/zu_aff.py 2008-12-23 15:07:03 UTC (rev 9626)
@@ -0,0 +1,358 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright 2008 Friedel Wolff
+#
+# This file is part of Virtaal.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+"""This module creates an affix file for a Zulu spell checker."""
+
+import re
+
# Verbal concord prefixes.  The repeated strings presumably reflect noun
# classes that share a concord — confirm against a Zulu grammar reference.
subject_concords = ["ngi", "u", "si", "ni", "u", "ba", "i", "li", "a", "si", "zi", "lu", "bu", "ku"]
relative_prefixes = ["engi", "esi", "eni", "o", "aba", "e", "eli", "a", "esi", "ezi", "olu", "obu", "oku"]
situative_prefixes = ["e", "be"]
# Everything that can occupy the subject slot of a verb.
concords = subject_concords + relative_prefixes + situative_prefixes
object_concords = ["ngi", "ku", "si", "ni", "m", "ba", "wu", "yi", "li", "wa", "zi", "lu", "bu"]

# Each rule list starts with a header entry [flag, "Y", PFX/SFX] that
# output_myspell() rewrites into the affix-file header line; every entry
# appended later is a [strip, affix, condition] rule (see verb_rules).
a_rules = [["a", "Y", "PFX"]]
"""prefixes only applicable to verbs ending on -a"""
A_rules = [["A", "Y", "PFX"]]
"""prefixes for almost all positive verbs and future negatives"""
V_rules = [["V", "Y", "SFX"]]
"""common suffixes for verbs ending on -a"""
rules = [a_rules, A_rules, V_rules]
"""all rules"""
+
# Search/replace pairs used for palatalisation.  They are defined at module
# level so the patterns are compiled only once.  It is crucial that the
# entries starting with 'm' come first; otherwise the m-less rules would
# fire on them first.  Hash marks flag the more common substitutions.

palatalisation_list = [
    ["mbw", "njw"],#
    ["mpw", "ntshw"],
    ["mw", "nyw"], #
    ["bw", "tshw"],
    ["bhw", "jw"],
    ["phw", "shw"],#
]

# Pre-compiled [pattern, replacement] pairs, in the same order as above.
palatalisation_re_list = [
    [re.compile(search), replacement]
    for (search, replacement) in palatalisation_list
]
+
def palatalise(string):
    """Return *string* with every palatalisation substitution applied.

    The substitutions run in the order given by palatalisation_re_list,
    which puts the 'm'-initial clusters first on purpose.
    """
    result = string
    for pattern, replacement in palatalisation_re_list:
        result = pattern.sub(replacement, result)
    return result
+
# (search, suggestion) pairs emitted as REP lines in the affix file, to
# help the spell checker suggest common aspiration/voicing confusions.
# NOTE: the original list contained ["g", "k"] twice (before and after
# ["mg", "mng"]), which produced a duplicate REP entry; the duplicate has
# been removed.
replace_list = [
    ["bh", "b"],
    ["b", "bh"],
    ["ch", "c"],
    ["c", "ch"],
    ["dh", "d"],
    # dh doesn't occur
    ["hh", "h"],
    ["h", "hh"],
    ["kh", "k"],
    ["k", "kh"],
    ["k", "g"],
    ["g", "k"],
    ["mg", "mng"],
    ["ph", "p"],
    ["p", "ph"],
    ["qh", "q"],
    ["q", "qh"],
    ["th", "t"],
    ["t", "th"],
    ["xh", "x"],
    ["x", "xh"],
]
"""These are suggestions for replacement."""

# Palatalisation pairs are also useful as replacement suggestions.
replace_list.extend(palatalisation_list)
+
def illegal_reflexive(subject, object):
    """Return whether using the given concords together would result in an
    illegal reflexive.

    Although the object concord will always change to 'zi' if reflexivity
    is intended, most combinations are actually valid for non-reflexive
    usage.  Example:
        Umama uyamsiza (ubaba).
    Both subject and object are class 1a nouns, but since they refer to
    different entities, the word is valid.  Reflexives are not handled
    explicitly anywhere, since they are indistinguishable from the case
    where the object is in the 'zi' class.  There are therefore only two
    truly illegal cases: 'ngi' and 'ni' combined with themselves.
    """
    # NOTE: the second parameter shadows the builtin `object`; the name is
    # kept for interface compatibility with existing callers.
    return subject == object and subject in ("ngi", "ni")
+
+
def add_semivowels(prefix):
    """Insert a semi-vowel (y or w) between clashing vowels in *prefix*."""
    #TODO: maybe iya?
    result = prefix
    # The substitutions must run in this order (a, then i, then u), exactly
    # as the original sequential re.sub calls did.
    for trailing, replacement in ((r"([aeiou])a", r"\1wa"),
                                  (r"([aeiou])i", r"\1yi"),
                                  (r"([aeiou])u", r"\1wu")):
        result = re.sub(trailing, replacement, result)
    return result
+
def contract(prefix):
    """Contract each pair of adjacent vowels into the second vowel alone."""
    return re.sub(r"[aeiou]([aeiou])", r"\1", prefix)
+
def verb_rules(prefix):
    """Generate the necessary rules to prepend the given prefix to a verb

    It receives a string with the already built (complete) prefix and
    returns a list of lists, with each list representing one affix rule as
    a [strip, affix, condition] triple (the format output_myspell emits
    inside a PFX block).  The consequences of vowel verbs are taken into
    account here, so no caller of this function needs to handle vowel
    verbs itself.
    """
    changed = []
    #normal verb starting on consonant:
    changed.append(["0", prefix, "[^y]"])
    #monosyllabic verbs: in the dictionary in imperative form, e.g. yidla
    # (strip the imperative "yi" before attaching the prefix)
    changed.append(["yi", prefix, "yi"])

    #now for the complicated part: verbs starting on vowels. (e.g. yakha)
    # Vowel verbs are stored with a leading "y" (yakha, yenza), so each
    # branch below strips that "y" and adjusts the prefix's final vowel.
    if prefix[-1] == 'u':
        #the 'u' always needs to be removed, we probably need a 'w'
        prefix = prefix[0:-1]
        #if the original prefix ended on 'wu', we don't want to add
        #another 'w' as this will result in 'ww'
        if len(prefix) > 0 and prefix[-1] == 'w':
            changed.append(["y", prefix, "y[ae]"])
        else:
            changed.append(["y", prefix + 'w', "y[ae]"])
        if len(prefix) == 0:
            #if prefix == 'u' before 'o' we change to 'w' as above
            changed.append(["y", 'w', "yo"])
        else:
            #for a prefix ending on 'u' before 'o' we simply remove the 'u'
            #without adding 'w', e.g. lu + osa -> losa
            changed.append(["y", prefix, "yo"])
        return changed
    if prefix[-1] == 'i':
        #if the complete prefix is 'i' before a vowel verb, we can
        #ignore it, as it is simply the same as the imperative form
        #example: i + enza = yenza
        #
        #otherwise, change 'i' to 'y'
        if len(prefix) > 1:
            prefix = prefix[0:-1]
            changed.append(["y", prefix, "y[^i]"])
            #changed.append(["y", prefix, "y"])
            #TODO: this just made the "yi", prefix "yi" rule (above) unnecessary
        return changed
    if prefix[-1] == 'a':
        # final 'a' coalesces with the verb's initial vowel, so drop it
        prefix = prefix[0:-1]
        changed.append(["y", prefix, "y[^i]"])
        #changed.append(["y", prefix, "y"])
        #TODO: this just made the "yi", prefix "yi" rule (above) unnecessary
        return changed
    if prefix == "o":
        #Although other prefixes can end on 'o' ('zo' or 'yo'), these
        #are only used with consonant verbs and not with monosyllabics.
        #TODO: verify if others are possible
        #TODO: verify if 'zo' and 'yo' are only used before consonants

        return changed
    if prefix[-1] == "e":
        #TODO: situative e- and be- should be handled (entirely thrown away?).
        #We can probably just generate them, and duplicate handling should take
        #care of them.
        pass
    return changed
+
def quicksort(l):
    """Return a copy of *l* sorted by each rule's affix text (element 1).

    Kept under its historical name for compatibility.  The hand-rolled
    quicksort (O(n^2) worst case, and quadratic list concatenation) has
    been replaced by the built-in stable sort.  The original partitioning
    (first element as pivot, equal keys kept to the right in order) was
    stable too, so the output order is identical.
    """
    return sorted(l, key=lambda rule: rule[1])
+
def remove_duplicates(rules):
    """Remove duplicate rules and return the trimmed rules list.

    The first element is the header line and stays in place; the remaining
    rules are sorted stably by affix text (the same order quicksort()
    produces) so duplicates become adjacent, then collapsed in one pass.
    As in the original, a rule identical to the header is dropped too.

    The original implementation called list.remove() while scanning, which
    is O(n^2); this single-pass rebuild yields the identical result.
    """
    deduped = [rules[0]]
    previous = rules[0]
    for rule in sorted(rules[1:], key=lambda r: r[1]):
        if rule != previous:
            deduped.append(rule)
        previous = rule
    return deduped
+
+def output_myspell():
+ """Output the generated rules in the format required for a myspell affix
+ file."""
+ print """# Automatically generated by zu_aff.py"
+SET ISO8859-1
+TRY aeinulkhosbgywmztdpfcqrvj-ASJMHxEKBGNPTRLDIZFOUWVYC
+
+"""
+ print "REP %d" % len(replace_list)
+ for rep in replace_list:
+ print "REP %s %s" % (rep[0], rep[1].replace(' ', '_'))
+ print
+
+ for rule_set in rules:
+ identifier = rule_set[0][0]
+ rule_set[0][0] = ''
+ affix_type = rule_set[0][2]
+ rule_set[0][2] = str(len(rule_set)-1)
+ #remember that the first element does not count
+ for rule in rule_set:
+ print affix_type + " " + identifier + ' ' + rule[0],
+ if len(rule[1]) > 0:
+ print rule[1],
+ else:
+ print "0",
+ print rule[2]
+ print
+
+################################################################################
+
# Build the prefix rules: for every concord, generate rules for the plain,
# -nga- and -sa- forms (A_rules: all positive verbs), the future tenses
# (a_rules: only verbs ending on -a), and the same again combined with each
# legal object concord.
for i in concords:
    A_rules.extend(verb_rules(i))

    A_rules.extend(verb_rules(i+"nga"))
    A_rules.extend(verb_rules(i+"sa"))

    #Future tenses:
    a_rules.extend(verb_rules(i+"zo"))
    a_rules.extend(verb_rules(i+"zoku"))
    a_rules.extend(verb_rules(i+"yo"))
    a_rules.extend(verb_rules(i+"yoku"))

    #-sa- + future tenses:
    a_rules.extend(verb_rules(i+"sazo"))
    a_rules.extend(verb_rules(i+"sazoku"))
    a_rules.extend(verb_rules(i+"sayo"))
    a_rules.extend(verb_rules(i+"sayoku"))

    for j in object_concords:
        # skip the two combinations that can only be bad reflexives
        if illegal_reflexive(i, j):
            continue
        A_rules.extend(verb_rules(i+j))
        A_rules.extend(verb_rules(i+"nga"+j)) #confusable with negatives
        A_rules.extend(verb_rules(i+"sa"+j))

        #Future tenses:
        a_rules.extend(verb_rules(i+"zo"+j))
        a_rules.extend(verb_rules(i+"zoku"+j))
        a_rules.extend(verb_rules(i+"yo"+j))
        a_rules.extend(verb_rules(i+"yoku"+j))

        #-sa- + future tenses:
        a_rules.extend(verb_rules(i+"sazo"+j))
        a_rules.extend(verb_rules(i+"sazoku"+j))
        a_rules.extend(verb_rules(i+"sayo"+j))
        a_rules.extend(verb_rules(i+"sayoku"+j))
+
#Mode specific ones: forms that only combine with plain subject concords.
for i in subject_concords:
    #Indicative:
    a_rules.extend(verb_rules(i+"ya"))
    #TODO: be- and se- forms

    #Negative future tenses:
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"zu"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"zuku"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"yu"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"yuku"))

    #-ka- + negative future tenses:
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kazu"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kazuku"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kayu"))
    a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kayuku"))

    #Remote past tense:
    a_rules.extend(verb_rules(contract(i+"a")))

    for j in object_concords:
        if illegal_reflexive(i, j):
            continue
        #Indicative:
        a_rules.extend(verb_rules(i + "ya" + j))
        #Infinitive.  The original accidentally generated this rule four
        #times; once is enough — the copies were only trimmed again later
        #by remove_duplicates(), so the final output is unchanged.  (Note
        #that "uku" + j does not depend on the subject concord i at all.)
        a_rules.extend(verb_rules("uku" + j))

        #Negative future tenses:
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"zu"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"zuku"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"yu"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"yuku"+j))

        #-ka- + negative future tenses:
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kazu"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kazuku"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kayu"+j))
        a_rules.extend(verb_rules(add_semivowels("a"+ i) +"kayuku"+j))

        #TODO: be- and se- forms

        #Remote past tense:
        a_rules.extend(verb_rules(contract(i+"a")+j))
+
+
# A few fixed prefixes that are not built from concords.
a_rules.extend(verb_rules("loku"))
a_rules.extend(verb_rules("ngoku"))
a_rules.extend(verb_rules("noku"))

# Verbal extension suffixes for verbs ending on -a, as [strip, add,
# condition] triples (applicative -el-, neuter -ek-, causative -is-, etc.).
#Lines below indicated with hashes will cause incorrect imperatives
V_rules.append(['a', 'ela', 'a'])
V_rules.append(['a', 'elani', 'a'])
V_rules.append(['a', 'elaphi', 'a']) #
V_rules.append(['a', 'eka', 'a'])
V_rules.append(['a', 'ekana', 'a'])
V_rules.append(['a', 'ekani', 'a'])
V_rules.append(['a', 'ekaphi', 'a']) #
V_rules.append(['a', 'isa', 'a'])
V_rules.append(['a', 'isana', 'a'])
V_rules.append(['a', 'isani', 'a'])
V_rules.append(['a', 'isaphi', 'a']) #
V_rules.append(['0', 'na', 'a'])
V_rules.append(['0', 'ni', 'a'])
V_rules.append(['0', 'phi', 'a']) #
V_rules.append(['a', 'wa', '[^w]a'])
#The above can create problems with monosyllabic verbs
V_rules.append(['a', 'wani', '[^w]a'])
V_rules.append(['a', 'waphi', '[^w]a'])
#The above can also be used with negatives, can't they? As in: Igama alipelwa.

# Trim each rule set (generation above produces many duplicates), then
# emit the finished affix file on stdout.
for i in range(len(rules)):
    rules[i] = remove_duplicates(rules[i])

output_myspell()
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|