[Docutils-checkins] SF.net SVN: docutils:[9109] trunk/docutils

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 9109
          http://sourceforge.net/p/docutils/code/9109
Author:   milde
Date:     2022-07-08 21:30:53 +0000 (Fri, 08 Jul 2022)
Log Message:
-----------
Detail planned input-encoding changes.

Modified Paths:
--------------
    trunk/docutils/RELEASE-NOTES.txt
    trunk/docutils/docutils/io.py
    trunk/docutils/test/test_io.py

Modified: trunk/docutils/RELEASE-NOTES.txt
===================================================================

--- trunk/docutils/RELEASE-NOTES.txt	2022-07-06 14:00:22 UTC (rev 9108)
+++ trunk/docutils/RELEASE-NOTES.txt	2022-07-08 21:30:53 UTC (rev 9109)
@@ -59,6 +59,8 @@
     default and locale encoding, fail.
 
   - Only remove BOM (U+FEFF ZWNBSP at start of data), no other ZWNBSPs.
+    Only remove BOM with `input_encoding` values None, '', 'utf-8-sig',
+    'utf-16', and 'utf-32'.
 
 * `html5` writer:
 

Modified: trunk/docutils/docutils/io.py
===================================================================
--- trunk/docutils/docutils/io.py	2022-07-06 14:00:22 UTC (rev 9108)
+++ trunk/docutils/docutils/io.py	2022-07-08 21:30:53 UTC (rev 9109)
@@ -122,6 +122,17 @@
             locale.setlocale(locale.LC_ALL, '')
 
         Raise UnicodeError if unsuccessful.
+
+        Provisional:
+          - Raise UnicodeError (instead of falling back to the locale
+            encoding) if decoding the source with the default encoding (UTF-8)
+            fails and Python is started in `UTF-8 mode`.
+
+            Raise UnicodeError (instead of falling back to "latin1") if both
+            the default and the locale encoding fail.
+
+          - Only remove BOM (U+FEFF ZWNBSP at start of data),
+            no other ZWNBSPs.
         """
         if self.encoding and self.encoding.lower() == 'unicode':
             assert isinstance(data, str), ('input encoding is "unicode" '
@@ -156,7 +167,8 @@
                 decoded = str(data, enc, self.error_handler)
                 self.successful_encoding = enc
                 # Return decoded, removing BOM and other ZWNBSPs.
-                # TODO: only remove BOM (ZWNBSP at start of data, API change).
+                # TODO: only remove BOM (ZWNBSP at start of data)
+                #       and only if 'self.encoding' is None. (API change)
                 return decoded.replace('\ufeff', '')
             except (UnicodeError, LookupError) as err:
                 # keep exception instance for use outside of the "for" loop.

Modified: trunk/docutils/test/test_io.py
===================================================================
--- trunk/docutils/test/test_io.py	2022-07-06 14:00:22 UTC (rev 9108)
+++ trunk/docutils/test/test_io.py	2022-07-08 21:30:53 UTC (rev 9109)
@@ -81,14 +81,13 @@
 
     def test_bom(self):
         # Provisional:
-        # TODO only remove BOM at start of data
-        input = io.StringInput(source=b'\xef\xbb\xbf foo \xef\xbb\xbf bar',
-                               encoding='utf-8')
-        # Assert BOMs are gone.
+        input = io.StringInput(source=b'\xef\xbb\xbf foo \xef\xbb\xbf bar')
+        # Assert BOM is gone.
+        # TODO: only remove BOM (ZWNBSP at start of data)
         self.assertEqual(input.read(), ' foo  bar')
-        # With unicode input:
+        # Unicode input is left unchanged:
         input = io.StringInput(source='\ufeff foo \ufeff bar')
-        # Assert BOMs are still there.
+        # Assert ZWNBSPs are still there.
         self.assertEqual(input.read(), '\ufeff foo \ufeff bar')
 
     def test_coding_slug(self):

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.