[Docutils-checkins] SF.net SVN: docutils:[7538] trunk/docutils

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 7538
          http://docutils.svn.sourceforge.net/docutils/?rev=7538&view=rev
Author:   milde
Date:     2012-11-23 01:18:49 +0000 (Fri, 23 Nov 2012)
Log Message:
-----------
normalize_language_tag() now returns `BCP 47`_ conformant tags

Subtags are now separated by ``-`` instead of ``_``.

Modified Paths:
--------------
    trunk/docutils/HISTORY.txt
    trunk/docutils/docutils/languages/__init__.py
    trunk/docutils/docutils/parsers/rst/languages/__init__.py
    trunk/docutils/docutils/utils/__init__.py
    trunk/docutils/docutils/writers/latex2e/__init__.py
    trunk/docutils/docutils/writers/xetex/__init__.py
    trunk/docutils/test/test_utils.py

Modified: trunk/docutils/HISTORY.txt
===================================================================

--- trunk/docutils/HISTORY.txt	2012-11-18 22:11:49 UTC (rev 7537)
+++ trunk/docutils/HISTORY.txt	2012-11-23 01:18:49 UTC (rev 7538)
@@ -43,6 +43,11 @@
 
   - Add SmartQuotes transform for typographic quotes and dashes.
 
+* docutils/utils/__init__.py
+
+  - normalize_language_tag() now returns `BCP 47`_ conformant tags
+    with subtags separated by ``-``.
+
 * docutils/writers/html4css1/__init__.py
 
   - Use ``<code>`` tag for inline "code",

Modified: trunk/docutils/docutils/languages/__init__.py
===================================================================
--- trunk/docutils/docutils/languages/__init__.py	2012-11-18 22:11:49 UTC (rev 7537)
+++ trunk/docutils/docutils/languages/__init__.py	2012-11-23 01:18:49 UTC (rev 7538)
@@ -27,6 +27,7 @@
     """
     # TODO: use a dummy module returning emtpy strings?, configurable?
     for tag in normalize_language_tag(language_code):
+        tag = tag.replace('-','_') # '-' not valid in module names
         if tag in _languages:
             return _languages[tag]
         try:

Modified: trunk/docutils/docutils/parsers/rst/languages/__init__.py
===================================================================
--- trunk/docutils/docutils/parsers/rst/languages/__init__.py	2012-11-18 22:11:49 UTC (rev 7537)
+++ trunk/docutils/docutils/parsers/rst/languages/__init__.py	2012-11-23 01:18:49 UTC (rev 7538)
@@ -22,6 +22,7 @@
 
 def get_language(language_code):
     for tag in normalize_language_tag(language_code):
+        tag = tag.replace('-','_') # '-' not valid in module names
         if tag in _languages:
             return _languages[tag]
         try:

Modified: trunk/docutils/docutils/utils/__init__.py
===================================================================
--- trunk/docutils/docutils/utils/__init__.py	2012-11-18 22:11:49 UTC (rev 7537)
+++ trunk/docutils/docutils/utils/__init__.py	2012-11-23 01:18:49 UTC (rev 7538)
@@ -12,6 +12,7 @@
 import sys
 import os
 import os.path
+import re
 import warnings
 import unicodedata
 from docutils import ApplicationError, DataError
@@ -642,20 +643,20 @@
 
     Example:
 
-      >>> normalize_language_tag('de-AT-1901')
-      ['de_at_1901', 'de_at', 'de_1901', 'de']
+    >>> normalize_language_tag('de_AT-1901')
+    ['de-at-1901', 'de-at', 'de-1901', 'de']
     """
     # normalize:
-    tag = tag.lower().replace('-','_')
+    tag = tag.lower().replace('_','-')
+    # split (except singletons, which mark the following tag as non-standard):
+    tag = re.sub(r'-([a-zA-Z0-9])-', r'-\1_', tag)
+    taglist = []
+    subtags = [subtag.replace('_', '-') for subtag in tag.split('-')]
+    base_tag = [subtags.pop(0)]
     # find all combinations of subtags
-    taglist = []
-    base_tag= tag.split('_')[:1]
-    subtags = tag.split('_')[1:]
-    # print base_tag, subtags
     for n in range(len(subtags), 0, -1):
         for tags in unique_combinations(subtags, n):
-            # print tags
-            taglist.append('_'.join(base_tag + tags))
+            taglist.append('-'.join(base_tag+tags))
     taglist += base_tag
     return taglist
 

Modified: trunk/docutils/docutils/writers/latex2e/__init__.py
===================================================================
--- trunk/docutils/docutils/writers/latex2e/__init__.py	2012-11-18 22:11:49 UTC (rev 7537)
+++ trunk/docutils/docutils/writers/latex2e/__init__.py	2012-11-23 01:18:49 UTC (rev 7538)
@@ -293,18 +293,18 @@
         'cy':           'welsh',
         'da':           'danish',
         'de':           'ngerman', # new spelling (de_1996)
-        'de_1901':      'german', # old spelling
-        'de_at':        'naustrian',
-        'de_at_1901':   'austrian',
+        'de-1901':      'german', # old spelling
+        'de-AT':        'naustrian',
+        'de-AT-1901':   'austrian',
         'dsb':          'lowersorbian',
         'el':           'greek', # monotonic (el-monoton)
-        'el_polyton':   'polutonikogreek',
+        'el-polyton':   'polutonikogreek',
         'en':           'english',  # TeX' default language
-        'en_au':        'australian',
-        'en_ca':        'canadian',
-        'en_gb':        'british',
-        'en_nz':        'newzealand',
-        'en_us':        'american',
+        'en-AU':        'australian',
+        'en-CA':        'canadian',
+        'en-GB':        'british',
+        'en-NZ':        'newzealand',
+        'en-US':        'american',
         'eo':           'esperanto', # '^' is active
         'es':           'spanish',
         'et':           'estonian',
@@ -312,10 +312,10 @@
         # 'fa':           'farsi',
         'fi':           'finnish',
         'fr':           'french',
-        'fr_ca':        'canadien',
+        'fr-CA':        'canadien',
         'ga':           'irish',    # Irish Gaelic
         # 'grc':                    # Ancient Greek
-        'grc_ibycus':   'ibycus',   # Ibycus encoding
+        'grc-ibycus':   'ibycus',   # Ibycus encoding
         'gl':           'galician',
         'he':           'hebrew',
         'hr':           'croatian',
@@ -338,24 +338,27 @@
         'no':           'norsk',     # Norwegian Bokmal
         'pl':           'polish',
         'pt':           'portuges',
-        'pt_br':        'brazil',
+        'pt-BR':        'brazil',
         'ro':           'romanian',
         'ru':           'russian',   # '"' is active
         'se':           'samin',     # North Sami
-        # sh-cyrl:      Serbo-Croatian, Cyrillic script
-        'sh-latn':      'serbian', # Serbo-Croatian, Latin script
+        # sh-Cyrl:      Serbo-Croatian, Cyrillic script
+        'sh-Latn':      'serbian', # Serbo-Croatian, Latin script
         'sk':           'slovak',
         'sl':           'slovene',
         'sq':           'albanian',
-        # 'sr-cyrl':    Serbian, Cyrillic script (sr-cyrl)
-        'sr-latn':      'serbian', # Serbian, Latin script, " active.
+        # 'sr-Cyrl':    Serbian, Cyrillic script (sr-cyrl)
+        'sr-Latn':      'serbian', # Serbian, Latin script, " active.
         'sv':           'swedish',
         # 'th':           'thai',
         'tr':           'turkish',
         'uk':           'ukrainian',
         'vi':           'vietnam',
-        # zh-latn:      Chinese Pinyin
+        # zh-Latn:      Chinese Pinyin
         }
+    # normalize (downcase) keys
+    language_codes = dict([(k.lower(), v) for (k,v) in language_codes.items()])
+
     warn_msg = 'Language "%s" not supported by LaTeX (babel)'
 
     def __init__(self, language_code, reporter=None):
@@ -1595,8 +1598,12 @@
             self.out.append( '%\n\\begin{list}{}{}\n' )
         else:
             self.out.append( '%\n\\begin{itemize}\n' )
+        # if node['classes']:
+        #     self.visit_inline(node)
 
     def depart_bullet_list(self, node):
+        # if node['classes']:
+        #     self.depart_inline(node)
         if self.is_toc_list:
             self.out.append( '\n\\end{list}\n' )
         else:

Modified: trunk/docutils/docutils/writers/xetex/__init__.py
===================================================================
--- trunk/docutils/docutils/writers/xetex/__init__.py	2012-11-18 22:11:49 UTC (rev 7537)
+++ trunk/docutils/docutils/writers/xetex/__init__.py	2012-11-23 01:18:49 UTC (rev 7538)
@@ -76,30 +76,33 @@
         # code          Polyglossia-name       comment
         'cop':          'coptic',
         'de':           'german', # new spelling (de_1996)
-        'de_1901':      'ogerman', # old spelling
+        'de-1901':      'ogerman', # old spelling
         'dv':           'divehi',  # Maldivian
         'dsb':          'lsorbian',
-        'el_polyton':   'polygreek',
+        'el-polyton':   'polygreek',
         'fa':           'farsi',
         'grc':          'ancientgreek',
         'hsb':          'usorbian',
-        'sh-cyrl':      'serbian', # Serbo-Croatian, Cyrillic script
-        'sh-latn':      'croatian', # Serbo-Croatian, Latin script
+        'sh-Cyrl':      'serbian', # Serbo-Croatian, Cyrillic script
+        'sh-Latn':      'croatian', # Serbo-Croatian, Latin script
         'sq':           'albanian',
-        'sr':           'serbian', # Cyrillic script (sr-cyrl)
+        'sr':           'serbian', # Cyrillic script (sr-Cyrl)
         'th':           'thai',
         'vi':           'vietnamese',
-        # zh-latn:      ???        #     Chinese Pinyin
+        # zh-Latn:      ???        #     Chinese Pinyin
         })
+    # normalize (downcase) keys
+    language_codes = dict([(k.lower(), v) for (k,v) in language_codes.items()])
+
     # Languages without Polyglossia support:
     for key in ('af',           # 'afrikaans',
-                'de_at',        # 'naustrian',
-                'de_at_1901',   # 'austrian',
-                'fr_ca',        # 'canadien',
-                'grc_ibycus',   # 'ibycus', (Greek Ibycus encoding)
-                'sr-latn',      # 'serbian script=latin'
+                'de-AT',        # 'naustrian',
+                'de-AT-1901',   # 'austrian',
+                'fr-CA',        # 'canadien',
+                'grc-ibycus',   # 'ibycus', (Greek Ibycus encoding)
+                'sr-Latn',      # 'serbian script=latin'
                ):
-        del(language_codes[key])
+        del(language_codes[key.lower()])
 
     def __init__(self, language_code, reporter):
         self.language_code = language_code

Modified: trunk/docutils/test/test_utils.py
===================================================================
--- trunk/docutils/test/test_utils.py	2012-11-18 22:11:49 UTC (rev 7537)
+++ trunk/docutils/test/test_utils.py	2012-11-23 01:18:49 UTC (rev 7538)
@@ -240,12 +240,15 @@
     def test_normalize_language_tag(self):
         self.assertEqual(utils.normalize_language_tag('de'), ['de'])
         self.assertEqual(utils.normalize_language_tag('de-AT'),
-                          ['de_at', 'de'])
+                         ['de-at', 'de'])
         self.assertEqual(utils.normalize_language_tag('de-AT-1901'),
-                          ['de_at_1901', 'de_at', 'de_1901', 'de'])
+                         ['de-at-1901', 'de-at', 'de-1901', 'de'])
         self.assertEqual(utils.normalize_language_tag('de-AT-1901-frak'),
-                          ['de_at_1901_frak', 'de_at_1901', 'de_at_frak',
-                          'de_1901_frak', 'de_at', 'de_1901', 'de_frak', 'de'])
+                         ['de-at-1901-frak', 'de-at-1901', 'de-at-frak',
+                          'de-1901-frak', 'de-at', 'de-1901', 'de-frak', 'de'])
+        self.assertEqual(utils.normalize_language_tag('grc-ibycus-x-altquot'),
+                         ['grc-ibycus-x-altquot', 'grc-ibycus',
+                          'grc-x-altquot', 'grc'])
 
     def test_column_width(self):
         self.assertEqual(utils.column_width(u'de'), 2)

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.