Thread: [Docutils-checkins] r5653 - in trunk/docutils: HISTORY.txt docutils/nodes.py test/test_nodes.py

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Author: grubert
Date: 2008-09-28 08:25:13 +0200 (Sun, 28 Sep 2008)
New Revision: 5653

Modified:
   trunk/docutils/HISTORY.txt
   trunk/docutils/docutils/nodes.py
   trunk/docutils/test/test_nodes.py
Log:
Apply [ 1878977 ] make_id(): deaccent characters.


Modified: trunk/docutils/HISTORY.txt
===================================================================

--- trunk/docutils/HISTORY.txt	2008-09-25 09:49:47 UTC (rev 5652)
+++ trunk/docutils/HISTORY.txt	2008-09-28 06:25:13 UTC (rev 5653)
@@ -19,6 +19,7 @@
 
 * General:
 
+  - Apply [ 1878977 ] make_id(): deaccent characters.
   - Backwards-compatible changes to remove python2.6 -3 deprecation warnings
   - Text nodes now subclass unicode rather than UserString
     (which is gone in python 3.0)

Modified: trunk/docutils/docutils/nodes.py
===================================================================
--- trunk/docutils/docutils/nodes.py	2008-09-25 09:49:47 UTC (rev 5652)
+++ trunk/docutils/docutils/nodes.py	2008-09-28 06:25:13 UTC (rev 5653)
@@ -27,8 +27,8 @@
 import warnings
 from types import IntType, SliceType, StringType, UnicodeType, \
      TupleType, ListType, ClassType, TypeType
+import unicodedata
 
-
 # ==============================
 #  Functional Node Base Classes
 # ==============================
@@ -1787,12 +1787,69 @@
     .. _HTML 4.01 spec: http://www.w3.org/TR/html401
     .. _CSS1 spec: http://www.w3.org/TR/REC-CSS1
     """
-    id = _non_id_chars.sub('-', ' '.join(string.lower().split()))
+    id = string.lower()
+    if not isinstance(id, unicode):
+        id = id.decode()
+    try:
+        id = id.translate(_non_id_translate_digraphs)
+    except (NotImplementedError):
+        # unicode.translate(dict) does not support 1-n-mappings in Python 2.2
+        pass
+    id = id.translate(_non_id_translate)
+    try:
+        id = unicodedata.normalize('NFKD', id).encode('ASCII', 'ignore')
+    except (AttributeError):
+        # unicodedata.normalize not supported in Python 2.2
+        pass
+    # shrink runs of whitespace and replace by hyphen
+    id = _non_id_chars.sub('-', ' '.join(id.split()))
     id = _non_id_at_ends.sub('', id)
     return str(id)
 
 _non_id_chars = re.compile('[^a-z0-9]+')
 _non_id_at_ends = re.compile('^[-0-9]+|-+$')
+_non_id_translate = {
+    0x00f8: u'o',       # o with stroke
+    0x0111: u'd',       # d with stroke
+    0x0127: u'h',       # h with stroke
+    0x0131: u'i',       # dotless i
+    0x0142: u'l',       # l with stroke
+    0x0167: u't',       # t with stroke
+    0x0180: u'b',       # b with stroke
+    0x0183: u'b',       # b with topbar
+    0x0188: u'c',       # c with hook
+    0x018c: u'd',       # d with topbar
+    0x0192: u'f',       # f with hook
+    0x0199: u'k',       # k with hook
+    0x019a: u'l',       # l with bar
+    0x019e: u'n',       # n with long right leg
+    0x01a5: u'p',       # p with hook
+    0x01ab: u't',       # t with palatal hook
+    0x01ad: u't',       # t with hook
+    0x01b4: u'y',       # y with hook
+    0x01b6: u'z',       # z with stroke
+    0x01e5: u'g',       # g with stroke
+    0x0225: u'z',       # z with hook
+    0x0234: u'l',       # l with curl
+    0x0235: u'n',       # n with curl
+    0x0236: u't',       # t with curl
+    0x0237: u'j',       # dotless j
+    0x023c: u'c',       # c with stroke
+    0x023f: u's',       # s with swash tail
+    0x0240: u'z',       # z with swash tail
+    0x0247: u'e',       # e with stroke
+    0x0249: u'j',       # j with stroke
+    0x024b: u'q',       # q with hook tail
+    0x024d: u'r',       # r with stroke
+    0x024f: u'y',       # y with stroke
+}
+_non_id_translate_digraphs = {
+    0x00df: u'sz',      # ligature sz
+    0x00e6: u'ae',      # ae
+    0x0153: u'oe',      # ligature oe
+    0x0238: u'db',      # db digraph
+    0x0239: u'qp',      # qp digraph
+}
 
 def dupname(node, name):
     node['dupnames'].append(name)

Modified: trunk/docutils/test/test_nodes.py
===================================================================
--- trunk/docutils/test/test_nodes.py	2008-09-25 09:49:47 UTC (rev 5652)
+++ trunk/docutils/test/test_nodes.py	2008-09-28 06:25:13 UTC (rev 5653)
@@ -186,15 +186,189 @@
         nodes.node_class_names.sort()
         self.assertEquals(node_class_names, nodes.node_class_names)
 
-    ids = [('a', 'a'), ('A', 'a'), ('', ''), ('a b \n c', 'a-b-c'),
+    ids = [(u'a', 'a'), ('A', 'a'), ('', ''), ('a b \n c', 'a-b-c'),
            ('a.b.c', 'a-b-c'), (' - a - b - c - ', 'a-b-c'), (' - ', ''),
            (u'\u2020\u2066', ''), (u'a \xa7 b \u2020 c', 'a-b-c'),
-           ('1', ''), ('1abc', 'abc')]
+           ('1', ''), ('1abc', 'abc'),
+          ]
+    ids_unicode_all = [
+            (u'\u00f8 o with stroke', 'o-o-with-stroke'),
+            (u'\u0111 d with stroke', 'd-d-with-stroke'),
+            (u'\u0127 h with stroke', 'h-h-with-stroke'),
+            (u'\u0131 dotless i', 'i-dotless-i'),
+            (u'\u0142 l with stroke', 'l-l-with-stroke'),
+            (u'\u0167 t with stroke', 't-t-with-stroke'),
+           # From Latin Extended-B
+            (u'\u0180 b with stroke', 'b-b-with-stroke'),
+            (u'\u0183 b with topbar', 'b-b-with-topbar'),
+            (u'\u0188 c with hook', 'c-c-with-hook'),
+            (u'\u018c d with topbar', 'd-d-with-topbar'),
+            (u'\u0192 f with hook', 'f-f-with-hook'),
+            (u'\u0199 k with hook', 'k-k-with-hook'),
+            (u'\u019a l with bar', 'l-l-with-bar'),
+            (u'\u019e n with long right leg', 'n-n-with-long-right-leg'),
+            (u'\u01a5 p with hook', 'p-p-with-hook'),
+            (u'\u01ab t with palatal hook', 't-t-with-palatal-hook'),
+            (u'\u01ad t with hook', 't-t-with-hook'),
+            (u'\u01b4 y with hook', 'y-y-with-hook'),
+            (u'\u01b6 z with stroke', 'z-z-with-stroke'),
+            (u'\u01e5 g with stroke', 'g-g-with-stroke'),
+            (u'\u0225 z with hook', 'z-z-with-hook'),
+            (u'\u0234 l with curl', 'l-l-with-curl'),
+            (u'\u0235 n with curl', 'n-n-with-curl'),
+            (u'\u0236 t with curl', 't-t-with-curl'),
+            (u'\u0237 dotless j', 'j-dotless-j'),
+            (u'\u023c c with stroke', 'c-c-with-stroke'),
+            (u'\u023f s with swash tail', 's-s-with-swash-tail'),
+            (u'\u0240 z with swash tail', 'z-z-with-swash-tail'),
+            (u'\u0247 e with stroke', 'e-e-with-stroke'),
+            (u'\u0249 j with stroke', 'j-j-with-stroke'),
+            (u'\u024b q with hook tail', 'q-q-with-hook-tail'),
+            (u'\u024d r with stroke', 'r-r-with-stroke'),
+            (u'\u024f y with stroke', 'y-y-with-stroke'),
+          ]
+    ids_unicode_not_2_2 = [
+           # From Latin-1 Supplements
+            (u'\u00e0: a with grave', 'a-a-with-grave'),
+            (u'\u00e1 a with acute', 'a-a-with-acute'),
+            (u'\u00e2 a with circumflex', 'a-a-with-circumflex'),
+            (u'\u00e3 a with tilde', 'a-a-with-tilde'),
+            (u'\u00e4 a with diaeresis', 'a-a-with-diaeresis'),
+            (u'\u00e5 a with ring above', 'a-a-with-ring-above'),
+            (u'\u00e7 c with cedilla', 'c-c-with-cedilla'),
+            (u'\u00e8 e with grave', 'e-e-with-grave'),
+            (u'\u00e9 e with acute', 'e-e-with-acute'),
+            (u'\u00ea e with circumflex', 'e-e-with-circumflex'),
+            (u'\u00eb e with diaeresis', 'e-e-with-diaeresis'),
+            (u'\u00ec i with grave', 'i-i-with-grave'),
+            (u'\u00ed i with acute', 'i-i-with-acute'),
+            (u'\u00ee i with circumflex', 'i-i-with-circumflex'),
+            (u'\u00ef i with diaeresis', 'i-i-with-diaeresis'),
+            (u'\u00f1 n with tilde', 'n-n-with-tilde'),
+            (u'\u00f2 o with grave', 'o-o-with-grave'),
+            (u'\u00f3 o with acute', 'o-o-with-acute'),
+            (u'\u00f4 o with circumflex', 'o-o-with-circumflex'),
+            (u'\u00f5 o with tilde', 'o-o-with-tilde'),
+            (u'\u00f6 o with diaeresis', 'o-o-with-diaeresis'),
+            (u'\u00f9 u with grave', 'u-u-with-grave'),
+            (u'\u00fa u with acute', 'u-u-with-acute'),
+            (u'\u00fb u with circumflex', 'u-u-with-circumflex'),
+            (u'\u00fc u with diaeresis', 'u-u-with-diaeresis'),
+            (u'\u00fd y with acute', 'y-y-with-acute'),
+            (u'\u00ff y with diaeresis', 'y-y-with-diaeresis'),
+           # From Latin Extended-A
+            (u'\u0101 a with macron', 'a-a-with-macron'),
+            (u'\u0103 a with breve', 'a-a-with-breve'),
+            (u'\u0105 a with ogonek', 'a-a-with-ogonek'),
+            (u'\u0107 c with acute', 'c-c-with-acute'),
+            (u'\u0109 c with circumflex', 'c-c-with-circumflex'),
+            (u'\u010b c with dot above', 'c-c-with-dot-above'),
+            (u'\u010d c with caron', 'c-c-with-caron'),
+            (u'\u010f d with caron', 'd-d-with-caron'),
+            (u'\u0113 e with macron', 'e-e-with-macron'),
+            (u'\u0115 e with breve', 'e-e-with-breve'),
+            (u'\u0117 e with dot above', 'e-e-with-dot-above'),
+            (u'\u0119 e with ogonek', 'e-e-with-ogonek'),
+            (u'\u011b e with caron', 'e-e-with-caron'),
+            (u'\u011d g with circumflex', 'g-g-with-circumflex'),
+            (u'\u011f g with breve', 'g-g-with-breve'),
+            (u'\u0121 g with dot above', 'g-g-with-dot-above'),
+            (u'\u0123 g with cedilla', 'g-g-with-cedilla'),
+            (u'\u0125 h with circumflex', 'h-h-with-circumflex'),
+            (u'\u0129 i with tilde', 'i-i-with-tilde'),
+            (u'\u012b i with macron', 'i-i-with-macron'),
+            (u'\u012d i with breve', 'i-i-with-breve'),
+            (u'\u012f i with ogonek', 'i-i-with-ogonek'),
+            (u'\u0133 ligature ij', 'ij-ligature-ij'),
+            (u'\u0135 j with circumflex', 'j-j-with-circumflex'),
+            (u'\u0137 k with cedilla', 'k-k-with-cedilla'),
+            (u'\u013a l with acute', 'l-l-with-acute'),
+            (u'\u013c l with cedilla', 'l-l-with-cedilla'),
+            (u'\u013e l with caron', 'l-l-with-caron'),
+            (u'\u0140 l with middle dot', 'l-l-with-middle-dot'),
+            (u'\u0144 n with acute', 'n-n-with-acute'),
+            (u'\u0146 n with cedilla', 'n-n-with-cedilla'),
+            (u'\u0148 n with caron', 'n-n-with-caron'),
+            (u'\u014d o with macron', 'o-o-with-macron'),
+            (u'\u014f o with breve', 'o-o-with-breve'),
+            (u'\u0151 o with double acute', 'o-o-with-double-acute'),
+            (u'\u0155 r with acute', 'r-r-with-acute'),
+            (u'\u0157 r with cedilla', 'r-r-with-cedilla'),
+            (u'\u0159 r with caron', 'r-r-with-caron'),
+            (u'\u015b s with acute', 's-s-with-acute'),
+            (u'\u015d s with circumflex', 's-s-with-circumflex'),
+            (u'\u015f s with cedilla', 's-s-with-cedilla'),
+            (u'\u0161 s with caron', 's-s-with-caron'),
+            (u'\u0163 t with cedilla', 't-t-with-cedilla'),
+            (u'\u0165 t with caron', 't-t-with-caron'),
+            (u'\u0169 u with tilde', 'u-u-with-tilde'),
+            (u'\u016b u with macron', 'u-u-with-macron'),
+            (u'\u016d u with breve', 'u-u-with-breve'),
+            (u'\u016f u with ring above', 'u-u-with-ring-above'),
+            (u'\u0171 u with double acute', 'u-u-with-double-acute'),
+            (u'\u0173 u with ogonek', 'u-u-with-ogonek'),
+            (u'\u0175 w with circumflex', 'w-w-with-circumflex'),
+            (u'\u0177 y with circumflex', 'y-y-with-circumflex'),
+            (u'\u017a z with acute', 'z-z-with-acute'),
+            (u'\u017c z with dot above', 'z-z-with-dot-above'),
+            (u'\u017e z with caron', 'z-z-with-caron'),
+           # From Latin Extended-B
+            (u'\u01a1 o with horn', 'o-o-with-horn'),
+            (u'\u01b0 u with horn', 'u-u-with-horn'),
+            (u'\u01c6 dz with caron', 'dz-dz-with-caron'),
+            (u'\u01c9 lj', 'lj-lj'),
+            (u'\u01cc nj', 'nj-nj'),
+            (u'\u01ce a with caron', 'a-a-with-caron'),
+            (u'\u01d0 i with caron', 'i-i-with-caron'),
+            (u'\u01d2 o with caron', 'o-o-with-caron'),
+            (u'\u01d4 u with caron', 'u-u-with-caron'),
+            (u'\u01e7 g with caron', 'g-g-with-caron'),
+            (u'\u01e9 k with caron', 'k-k-with-caron'),
+            (u'\u01eb o with ogonek', 'o-o-with-ogonek'),
+            (u'\u01ed o with ogonek and macron', 'o-o-with-ogonek-and-macron'),
+            (u'\u01f0 j with caron', 'j-j-with-caron'),
+            (u'\u01f3 dz', 'dz-dz'),
+            (u'\u01f5 g with acute', 'g-g-with-acute'),
+            (u'\u01f9 n with grave', 'n-n-with-grave'),
+            (u'\u0201 a with double grave', 'a-a-with-double-grave'),
+            (u'\u0203 a with inverted breve', 'a-a-with-inverted-breve'),
+            (u'\u0205 e with double grave', 'e-e-with-double-grave'),
+            (u'\u0207 e with inverted breve', 'e-e-with-inverted-breve'),
+            (u'\u0209 i with double grave', 'i-i-with-double-grave'),
+            (u'\u020b i with inverted breve', 'i-i-with-inverted-breve'),
+            (u'\u020d o with double grave', 'o-o-with-double-grave'),
+            (u'\u020f o with inverted breve', 'o-o-with-inverted-breve'),
+            (u'\u0211 r with double grave', 'r-r-with-double-grave'),
+            (u'\u0213 r with inverted breve', 'r-r-with-inverted-breve'),
+            (u'\u0215 u with double grave', 'u-u-with-double-grave'),
+            (u'\u0217 u with inverted breve', 'u-u-with-inverted-breve'),
+            (u'\u0219 s with comma below', 's-s-with-comma-below'),
+            (u'\u021b t with comma below', 't-t-with-comma-below'),
+            (u'\u021f h with caron', 'h-h-with-caron'),
+            (u'\u0227 a with dot above', 'a-a-with-dot-above'),
+            (u'\u0229 e with cedilla', 'e-e-with-cedilla'),
+            (u'\u022f o with dot above', 'o-o-with-dot-above'),
+            (u'\u0233 y with macron', 'y-y-with-macron'),
+           # digraphs From Latin-1 Supplements
+            (u'\u00df: ligature sz', 'sz-ligature-sz'),
+            (u'\u00e6 ae', 'ae-ae'),
+            (u'\u0153 ligature oe', 'oe-ligature-oe'),
+            (u'\u0238 db digraph', 'db-db-digraph'),
+            (u'\u0239 qp digraph', 'qp-qp-digraph'),
+            ]
 
     def test_make_id(self):
-        for input, output in self.ids:
-            normed = nodes.make_id(input)
-            self.assertEquals(normed, output)
+        failures = []
+        tests = self.ids + self.ids_unicode_all
+        import sys
+        if sys.version_info[:2] != (2, 2):
+            tests += self.ids_unicode_not_2_2
+        for input, expect in tests:
+            output = nodes.make_id(input)
+            if expect != output:
+                failures.append("'%s' != '%s'" % (expect, output))
+        if failures:
+            self.fail("%d failures in %d\n%s" % (len(failures), len(self.ids), "\n".join(failures)))
 
     def test_traverse(self):
         e = nodes.Element()





Thread: [Docutils-checkins] r5653 - in trunk/docutils: HISTORY.txt docutils/nodes.py test/test_nodes.py

docutils-checkins