[Docutils-checkins] SF.net SVN: docutils:[9404] trunk/docutils

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 9404
          http://sourceforge.net/p/docutils/code/9404
Author:   milde
Date:     2023-06-24 19:40:30 +0000 (Sat, 24 Jun 2023)
Log Message:
-----------
Last part of the input encoding changes announced for 0.21.

Do not use the locale encoding as fallback if Python is started in `UTF-8 mode`_.
Stop using "latin1" as second fallback.

Update tests and documentation.

Modified Paths:
--------------
    trunk/docutils/HISTORY.txt
    trunk/docutils/RELEASE-NOTES.txt
    trunk/docutils/docs/api/publisher.txt
    trunk/docutils/docutils/io.py
    trunk/docutils/test/test_CLI.py
    trunk/docutils/test/test_io.py

Modified: trunk/docutils/HISTORY.txt
===================================================================

--- trunk/docutils/HISTORY.txt	2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/HISTORY.txt	2023-06-24 19:40:30 UTC (rev 9404)
@@ -17,10 +17,23 @@
 Changes since 0.20.1
 ====================
 
+* docutils/io.py
+
+  - Simpler and more secure `input encoding`_ default behaviour:
+
+    Do not use the locale encoding as fallback if Python is started in
+    `UTF-8 mode`_. Stop using "latin1" as second fallback.
+
+    Remove BOM (U+FEFF ZWNBSP at start of data) only if the `input_encoding`_
+    configuration setting is None, '', 'utf-8-sig', 'utf-16', or 'utf-32'.
+    Do not remove other ZWNBSPs.
+
+  .. _input encoding: docs/api/publisher.html#encodings
+
 * docutils/utils/roman.py
 
-  Update to version `1.4 <https://pypi.org/project/roman/4.1/>`__.
-  Fixes feature-requests:#95 (license is now ZPL 2.1).
+  - Update to version `1.4 <https://pypi.org/project/roman/4.1/>`__.
+    Fixes feature-requests:#95 (license is now ZPL 2.1).
 
 * docutils/writers/latex2e/__init__.py
 

Modified: trunk/docutils/RELEASE-NOTES.txt
===================================================================
--- trunk/docutils/RELEASE-NOTES.txt	2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/RELEASE-NOTES.txt	2023-06-24 19:40:30 UTC (rev 9404)
@@ -79,17 +79,6 @@
 `Input encoding`_
 -----------------
 
-* Raise `UnicodeError` (instead of falling back to the locale encoding)
-  if decoding the source with the default encoding (UTF-8) fails and
-  Python is started in `UTF-8 mode`_. (Docutils 0.21)
-
-  Raise `UnicodeError` (instead of falling back to "latin1") if both,
-  default and locale encoding, fail. (Docutils 0.21)
-
-* Only remove BOM (U+FEFF ZWNBSP at start of data), no other ZWNBSPs.
-  Only remove BOM with `input_encoding`_ values None, '', 'utf-8-sig',
-  'utf-16', and 'utf-32'. (Docutils 0.21)
-
 * Change the default input encoding from ``None`` (auto-detect) to
   "utf-8" in Docutils 0.22.
 
@@ -205,7 +194,6 @@
   or "vermaning" to avoid errors in future conversions.
 
 .. _front end tools: docs/user/tools.html
-.. _input encoding:
 .. _input_encoding: docs/user/config.html#input-encoding
 .. _math_output: docs/user/config.html#math-output
 .. _UTF-8 mode: https://docs.python.org/3/library/os.html#utf8-mode
@@ -223,6 +211,21 @@
 .. _csv-table: docs/ref/rst/directives.html#csv-table
 
 
+Release 0.21b (unpublished)
+===========================
+
+* Simpler and more secure `input encoding`_ default behaviour:
+
+  Do not use the locale encoding as fallback if Python is started in
+  `UTF-8 mode`_. Stop using "latin1" as second fallback.
+
+  Remove BOM (U+FEFF ZWNBSP at start of data) only if the `input_encoding`_
+  configuration setting is None, '', 'utf-8-sig', 'utf-16', or 'utf-32'.
+  Do not remove other ZWNBSPs.
+
+  .. _input encoding: docs/api/publisher.html#encodings
+
+
 Release 0.20.1 (2023-05-17)
 ===========================
 

Modified: trunk/docutils/docs/api/publisher.txt
===================================================================
--- trunk/docutils/docs/api/publisher.txt	2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/docs/api/publisher.txt	2023-06-24 19:40:30 UTC (rev 9404)
@@ -445,25 +445,21 @@
 .. important:: Details will change over the next Docutils versions.
    See RELEASE-NOTES_
 
-The default **input encoding** is UTF-8. A different encoding can be
-specified with the `input_encoding`_ setting.
+The **input encoding** can be specified with the `input_encoding`_ setting.
 
-The encoding of a reStructuredText source can also be given by a
+By default, the input encoding is detected from a
 `Unicode byte order mark` (BOM_) or a "magic comment" [#magic-comment]_
-similar to :PEP:`263`. This makes the input encoding both *visible* and
-*changeable* on a per-source basis.
+similar to :PEP:`263`. The fallback is "utf-8".
+The default behaviour differs from Python's `open()`:
 
-If the encoding is unspecified and decoding with UTF-8 fails, the locale's
-`preferred encoding`_ is used as a fallback (if it maps to a valid codec
-and differs from UTF-8).
+- An `explicit encoding declaration` (BOM_ or "magic comment"
+  [#magic-comment]_) in the source takes precedence over
+  the `preferred encoding`_.
+- An optional BOM_ is removed from sources.
 
-The default behaviour differs from Python's `open()`:
+The default will change to "utf-8" in Docutils 0.22;
+the input encoding detection will be removed in Docutils 1.0.
 
-- The UTF-8 encoding is tried before the `preferred encoding`_.
-  (This is almost sure to fail if the actual source encoding differs.)
-- An `explicit encoding declaration` [#magic-comment]_ in the source
-  takes precedence over the `preferred encoding`_.
-- An optional BOM_ is removed from UTF-8 encoded sources.
 
 The default **output encoding** is UTF-8.
 A different encoding can be specified with the `output_encoding`_ setting.

Modified: trunk/docutils/docutils/io.py
===================================================================
--- trunk/docutils/docutils/io.py	2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/docutils/io.py	2023-06-24 19:40:30 UTC (rev 9404)
@@ -125,8 +125,8 @@
         Return Unicode `str` instances unchanged (nothing to decode).
 
         If `self.encoding` is None, determine encoding from data
-        or try UTF-8, locale encoding, and (as last ressort) 'latin-1'.
-        The client application should call ``locale.setlocale`` at the
+        or try UTF-8 and the locale's preferred encoding.
+        The client application should call ``locale.setlocale()`` at the
         beginning of processing::
 
             locale.setlocale(locale.LC_ALL, '')
@@ -133,16 +133,7 @@
 
         Raise UnicodeError if unsuccessful.
 
-        Provisional:
-          - Raise UnicodeError (instead of falling back to the locale
-            encoding) if decoding the source with the default encoding (UTF-8)
-            fails and Python is started in `UTF-8 mode`.
-
-            Raise UnicodeError (instead of falling back to "latin1") if both,
-            default and locale encoding, fail.
-
-          - Only remove BOM (U+FEFF ZWNBSP at start of data),
-            no other ZWNBSPs.
+        Provisional: encoding detection will be removed in Docutils 1.0.
         """
         if self.encoding and self.encoding.lower() == 'unicode':
             assert isinstance(data, str), ('input encoding is "unicode" '
@@ -157,21 +148,17 @@
         else:
             data_encoding = self.determine_encoding_from_data(data)
             if data_encoding:
-                # If the data declares its encoding (explicitly or via a BOM),
-                # we believe it.
+                # `data` declares its encoding with a "magic comment" or BOM:
                 encoding_candidates = [data_encoding]
             else:
-                # Apply heuristics only if no encoding is explicitly given and
-                # no BOM found.  Start with UTF-8, because that only matches
+                # Apply heuristics if the encoding is not specified.
+                # Start with UTF-8, because that only matches
                 # data that *IS* UTF-8:
                 encoding_candidates = ['utf-8']
-                # TODO: use `locale.getpreferredlocale(do_setlocale=True)`
-                # to respect UTF-8 mode (API change).
-                # (Check if it is a valid encoding and not UTF-8)
-                if _locale_encoding and _locale_encoding != 'utf-8':
-                    encoding_candidates.append(_locale_encoding)
-                # TODO: don't fall back to 'latin-1' (API change).
-                encoding_candidates.append('latin-1')
+                # If UTF-8 fails, fall back to the locale's preferred encoding:
+                fallback = locale.getpreferredencoding(do_setlocale=False)
+                if fallback and fallback.lower() != 'utf-8':
+                    encoding_candidates.append(fallback)
         for enc in encoding_candidates:
             try:
                 decoded = str(data, enc, self.error_handler)

Modified: trunk/docutils/test/test_CLI.py
===================================================================
--- trunk/docutils/test/test_CLI.py	2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/test/test_CLI.py	2023-06-24 19:40:30 UTC (rev 9404)
@@ -57,7 +57,9 @@
         del os.environ['DOCUTILSCONFIG']
         sys.stdout = self.orig_stdout
         sys.argv = self.orig_argv
-        locale.setlocale(locale.LC_ALL, 'C')  # restore default (C) locale
+        # restore default locale settings:
+        locale.setlocale(locale.LC_MESSAGES, 'C')
+        locale.setlocale(locale.LC_TIME, 'C')
 
     def get_help_text(self, prog, entry_point):
         # call entry_point function and collect help text

Modified: trunk/docutils/test/test_io.py
===================================================================
--- trunk/docutils/test/test_io.py	2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/test/test_io.py	2023-06-24 19:40:30 UTC (rev 9404)
@@ -7,9 +7,11 @@
 """
 Test module for `docutils.io`.
 """
-import os.path
 
+import codecs
+import locale
 from io import StringIO, BytesIO
+import os.path
 from pathlib import Path
 import sys
 import unittest
@@ -24,7 +26,11 @@
 # DATA_ROOT is ./test/data/ from the docutils root
 DATA_ROOT = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')
 
+# normalize the preferred encoding's name:
+preferredencoding = codecs.lookup(
+    locale.getpreferredencoding(do_setlocale=False)).name
 
+
 # Stub: Buffer with 'strict' auto-conversion of input to byte string:
 class BBuf(BytesIO):
     def write(self, data):
@@ -129,27 +135,6 @@
 """)
         self.assertNotEqual(input.successful_encoding, 'ascii')
 
-    def test_readlines(self):
-        input = du_io.FileInput(
-            source_path=os.path.join(DATA_ROOT, 'include.txt'))
-        data = input.readlines()
-        self.assertEqual(data, ['Some include text.\n'])
-
-    def test_heuristics_no_utf8(self):
-        # if no encoding is given and decoding with 'utf-8' fails,
-        # use either the locale encoding (if specified) or 'latin-1':
-        # Provisional: the second fallback 'latin-1' will be dropped
-        probed_encodings = (du_io._locale_encoding, 'latin-1')  # noqa
-        input = du_io.FileInput(
-            source_path=os.path.join(DATA_ROOT, 'latin1.txt'))
-        data = input.read()
-        if input.successful_encoding not in probed_encodings:
-            raise AssertionError(
-                "guessed encoding '%s' differs from probed encodings %r"
-                % (input.successful_encoding, probed_encodings))
-        if input.successful_encoding == 'latin-1':
-            self.assertEqual(data, 'Gr\xfc\xdfe\n')
-
     def test_decode_unicode(self):
         # With the special value "unicode" or "Unicode":
         uniinput = du_io.Input(encoding='unicode')
@@ -156,7 +141,8 @@
         # keep unicode instances as-is
         self.assertEqual(uniinput.decode('ja'), 'ja')
         # raise AssertionError if data is not a `str` instance
-        self.assertRaises(AssertionError, uniinput.decode, b'ja')
+        with self.assertRaises(AssertionError):
+            uniinput.decode(b'ja')
 
 
 class OutputTests(unittest.TestCase):
@@ -295,22 +281,27 @@
             source_path=os.path.join(DATA_ROOT, 'utf8.txt'))
         self.assertEqual(source.read(), 'Grüße\n')
 
-    @unittest.skipIf(du_io._locale_encoding in (None, 'utf-8', 'utf8'),
+    @unittest.skipIf(preferredencoding in (None, 'ascii', 'utf-8'),
                      'locale encoding not set or UTF-8')
     def test_fallback_no_utf8(self):
-        # if decoding with 'utf-8' fails, use the locale encoding
-        # (if not None) or 'latin-1'.
-        # provisional: behaviour details will change in future
-        # TODO: don't fall back to latin1
-        # TODO: use `locale.getpreferredlocale()` (honour UTF-8 mode)?
-        probed_encodings = (du_io._locale_encoding, 'latin-1')  # noqa
+        # If no encoding is given and decoding with 'utf-8' fails,
+        # use the locale's preferred encoding (if not None).
+        # Provisional: the default will become 'utf-8'
+        # (without auto-detection and fallback) in Docutils 0.22.
         source = du_io.FileInput(
             source_path=os.path.join(DATA_ROOT, 'latin1.txt'))
         data = source.read()
-        self.assertTrue(source.successful_encoding in probed_encodings)
-        if source.successful_encoding in ('latin-1', 'iso8859-1'):
+        successful_encoding = codecs.lookup(source.successful_encoding).name
+        self.assertEqual(successful_encoding, preferredencoding)
+        if successful_encoding == 'iso8859-1':
             self.assertEqual(data, 'Grüße\n')
 
+    def test_readlines(self):
+        source = du_io.FileInput(
+            source_path=os.path.join(DATA_ROOT, 'include.txt'))
+        data = source.readlines()
+        self.assertEqual(data, ['Some include text.\n'])
 
+
 if __name__ == '__main__':
     unittest.main()

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.