|
From: <mi...@us...> - 2022-07-08 21:30:56
|
Revision: 9109
http://sourceforge.net/p/docutils/code/9109
Author: milde
Date: 2022-07-08 21:30:53 +0000 (Fri, 08 Jul 2022)
Log Message:
-----------
Detail planned input-encoding changes.
Modified Paths:
--------------
trunk/docutils/RELEASE-NOTES.txt
trunk/docutils/docutils/io.py
trunk/docutils/test/test_io.py
Modified: trunk/docutils/RELEASE-NOTES.txt
===================================================================
--- trunk/docutils/RELEASE-NOTES.txt 2022-07-06 14:00:22 UTC (rev 9108)
+++ trunk/docutils/RELEASE-NOTES.txt 2022-07-08 21:30:53 UTC (rev 9109)
@@ -59,6 +59,8 @@
default and locale encoding, fail.
- Only remove BOM (U+FEFF ZWNBSP at start of data), no other ZWNBSPs.
+ Only remove BOM with `input_encoding` values None, '', 'utf-8-sig',
+ 'utf-16', and 'utf-32'.
* `html5` writer:
Modified: trunk/docutils/docutils/io.py
===================================================================
--- trunk/docutils/docutils/io.py 2022-07-06 14:00:22 UTC (rev 9108)
+++ trunk/docutils/docutils/io.py 2022-07-08 21:30:53 UTC (rev 9109)
@@ -122,6 +122,17 @@
locale.setlocale(locale.LC_ALL, '')
Raise UnicodeError if unsuccessful.
+
+ Provisional:
+ - Raise UnicodeError (instead of falling back to the locale
+ encoding) if decoding the source with the default encoding (UTF-8)
+ fails and Python is started in `UTF-8 mode`.
+
+ Raise UnicodeError (instead of falling back to "latin1") if both the
+ default and the locale encoding fail.
+
+ - Only remove BOM (U+FEFF ZWNBSP at start of data),
+ no other ZWNBSPs.
"""
if self.encoding and self.encoding.lower() == 'unicode':
assert isinstance(data, str), ('input encoding is "unicode" '
@@ -156,7 +167,8 @@
decoded = str(data, enc, self.error_handler)
self.successful_encoding = enc
# Return decoded, removing BOM and other ZWNBSPs.
- # TODO: only remove BOM (ZWNBSP at start of data, API change).
+ # TODO: only remove BOM (ZWNBSP at start of data)
+ # and only if 'self.encoding' is None. (API change)
return decoded.replace('\ufeff', '')
except (UnicodeError, LookupError) as err:
# keep exception instance for use outside of the "for" loop.
Modified: trunk/docutils/test/test_io.py
===================================================================
--- trunk/docutils/test/test_io.py 2022-07-06 14:00:22 UTC (rev 9108)
+++ trunk/docutils/test/test_io.py 2022-07-08 21:30:53 UTC (rev 9109)
@@ -81,14 +81,13 @@
def test_bom(self):
# Provisional:
- # TODO only remove BOM at start of data
- input = io.StringInput(source=b'\xef\xbb\xbf foo \xef\xbb\xbf bar',
- encoding='utf-8')
- # Assert BOMs are gone.
+ input = io.StringInput(source=b'\xef\xbb\xbf foo \xef\xbb\xbf bar')
+ # Assert BOM is gone.
+ # TODO: only remove BOM (ZWNBSP at start of data)
self.assertEqual(input.read(), ' foo bar')
- # With unicode input:
+ # Unicode input is left unchanged:
input = io.StringInput(source='\ufeff foo \ufeff bar')
- # Assert BOMs are still there.
+ # Assert ZWNBSPs are still there.
self.assertEqual(input.read(), '\ufeff foo \ufeff bar')
def test_coding_slug(self):
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|