|
From: <mi...@us...> - 2023-06-24 19:40:33
|
Revision: 9404
http://sourceforge.net/p/docutils/code/9404
Author: milde
Date: 2023-06-24 19:40:30 +0000 (Sat, 24 Jun 2023)
Log Message:
-----------
Last part of the input encoding changes announced for 0.21.
Do not use the locale encoding as fallback if Python is started in `UTF-8 mode`_.
Stop using "latin1" as second fallback.
Update tests and documentation.
Modified Paths:
--------------
trunk/docutils/HISTORY.txt
trunk/docutils/RELEASE-NOTES.txt
trunk/docutils/docs/api/publisher.txt
trunk/docutils/docutils/io.py
trunk/docutils/test/test_CLI.py
trunk/docutils/test/test_io.py
Modified: trunk/docutils/HISTORY.txt
===================================================================
--- trunk/docutils/HISTORY.txt 2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/HISTORY.txt 2023-06-24 19:40:30 UTC (rev 9404)
@@ -17,10 +17,23 @@
Changes since 0.20.1
====================
+* docutils/io.py
+
+* Simpler and more secure `input encoding`_ default behaviour:
+
+ Do not use the locale encoding as fallback if Python is started in
+ `UTF-8 mode`_. Stop using "latin1" as second fallback.
+
+ Remove BOM (U+FEFF ZWNBSP at start of data) only if the `input_encoding`_
+ configuration setting is None, '', 'utf-8-sig', 'utf-16', or 'utf-32'.
+ Do not remove other ZWNBSPs.
+
+ .. _input encoding: docs/api/publisher.html#encodings
+
* docutils/utils/roman.py
- Update to version `1.4 <https://pypi.org/project/roman/4.1/>`__.
- Fixes feature-requests:#95 (license is now ZPL 2.1).
+ - Update to version `1.4 <https://pypi.org/project/roman/4.1/>`__.
+ Fixes feature-requests:#95 (license is now ZPL 2.1).
* docutils/writers/latex2e/__init__.py
Modified: trunk/docutils/RELEASE-NOTES.txt
===================================================================
--- trunk/docutils/RELEASE-NOTES.txt 2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/RELEASE-NOTES.txt 2023-06-24 19:40:30 UTC (rev 9404)
@@ -79,17 +79,6 @@
`Input encoding`_
-----------------
-* Raise `UnicodeError` (instead of falling back to the locale encoding)
- if decoding the source with the default encoding (UTF-8) fails and
- Python is started in `UTF-8 mode`_. (Docutils 0.21)
-
- Raise `UnicodeError` (instead of falling back to "latin1") if both,
- default and locale encoding, fail. (Docutils 0.21)
-
-* Only remove BOM (U+FEFF ZWNBSP at start of data), no other ZWNBSPs.
- Only remove BOM with `input_encoding`_ values None, '', 'utf-8-sig',
- 'utf-16', and 'utf-32'. (Docutils 0.21)
-
* Change the default input encoding from ``None`` (auto-detect) to
"utf-8" in Docutils 0.22.
@@ -205,7 +194,6 @@
or "vermaning" to avoid errors in future conversions.
.. _front end tools: docs/user/tools.html
-.. _input encoding:
.. _input_encoding: docs/user/config.html#input-encoding
.. _math_output: docs/user/config.html#math-output
.. _UTF-8 mode: https://docs.python.org/3/library/os.html#utf8-mode
@@ -223,6 +211,21 @@
.. _csv-table: docs/ref/rst/directives.html#csv-table
+Release 0.21b (unpublished)
+===========================
+
+* Simpler and more secure `input encoding`_ default behaviour:
+
+ Do not use the locale encoding as fallback if Python is started in
+ `UTF-8 mode`_. Stop using "latin1" as second fallback.
+
+ Remove BOM (U+FEFF ZWNBSP at start of data) only if the `input_encoding`_
+ configuration setting is None, '', 'utf-8-sig', 'utf-16', or 'utf-32'.
+ Do not remove other ZWNBSPs.
+
+ .. _input encoding: docs/api/publisher.html#encodings
+
+
Release 0.20.1 (2023-05-17)
===========================
Modified: trunk/docutils/docs/api/publisher.txt
===================================================================
--- trunk/docutils/docs/api/publisher.txt 2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/docs/api/publisher.txt 2023-06-24 19:40:30 UTC (rev 9404)
@@ -445,25 +445,21 @@
.. important:: Details will change over the next Docutils versions.
See RELEASE-NOTES_
-The default **input encoding** is UTF-8. A different encoding can be
-specified with the `input_encoding`_ setting.
+The **input encoding** can be specified with the `input_encoding`_ setting.
-The encoding of a reStructuredText source can also be given by a
+By default, the input encoding is detected from a
`Unicode byte order mark` (BOM_) or a "magic comment" [#magic-comment]_
-similar to :PEP:`263`. This makes the input encoding both *visible* and
-*changeable* on a per-source basis.
+similar to :PEP:`263`. The fallback is "utf-8".
+The default behaviour differs from Python's `open()`:
-If the encoding is unspecified and decoding with UTF-8 fails, the locale's
-`preferred encoding`_ is used as a fallback (if it maps to a valid codec
-and differs from UTF-8).
+- An `explicit encoding declaration` (BOM_ or a "magic comment"
+ [#magic-comment]_) in the source takes precedence over
+ the `preferred encoding`_.
+- An optional BOM_ is removed from sources.
-The default behaviour differs from Python's `open()`:
+The default will change to "utf-8" in Docutils 0.22;
+the input encoding detection will be removed in Docutils 1.0.
-- The UTF-8 encoding is tried before the `preferred encoding`_.
- (This is almost sure to fail if the actual source encoding differs.)
-- An `explicit encoding declaration` [#magic-comment]_ in the source
- takes precedence over the `preferred encoding`_.
-- An optional BOM_ is removed from UTF-8 encoded sources.
The default **output encoding** is UTF-8.
A different encoding can be specified with the `output_encoding`_ setting.
Modified: trunk/docutils/docutils/io.py
===================================================================
--- trunk/docutils/docutils/io.py 2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/docutils/io.py 2023-06-24 19:40:30 UTC (rev 9404)
@@ -125,8 +125,8 @@
Return Unicode `str` instances unchanged (nothing to decode).
If `self.encoding` is None, determine encoding from data
- or try UTF-8, locale encoding, and (as last ressort) 'latin-1'.
- The client application should call ``locale.setlocale`` at the
+ or try UTF-8 and the locale's preferred encoding.
+ The client application should call ``locale.setlocale()`` at the
beginning of processing::
locale.setlocale(locale.LC_ALL, '')
@@ -133,16 +133,7 @@
Raise UnicodeError if unsuccessful.
- Provisional:
- - Raise UnicodeError (instead of falling back to the locale
- encoding) if decoding the source with the default encoding (UTF-8)
- fails and Python is started in `UTF-8 mode`.
-
- Raise UnicodeError (instead of falling back to "latin1") if both,
- default and locale encoding, fail.
-
- - Only remove BOM (U+FEFF ZWNBSP at start of data),
- no other ZWNBSPs.
+ Provisional: encoding detection will be removed in Docutils 1.0.
"""
if self.encoding and self.encoding.lower() == 'unicode':
assert isinstance(data, str), ('input encoding is "unicode" '
@@ -157,21 +148,17 @@
else:
data_encoding = self.determine_encoding_from_data(data)
if data_encoding:
- # If the data declares its encoding (explicitly or via a BOM),
- # we believe it.
+ # `data` declares its encoding with a "magic comment" or BOM:
encoding_candidates = [data_encoding]
else:
- # Apply heuristics only if no encoding is explicitly given and
- # no BOM found. Start with UTF-8, because that only matches
+ # Apply heuristics if the encoding is not specified.
+ # Start with UTF-8, because that only matches
# data that *IS* UTF-8:
encoding_candidates = ['utf-8']
- # TODO: use `locale.getpreferredlocale(do_setlocale=True)`
- # to respect UTF-8 mode (API change).
- # (Check if it is a valid encoding and not UTF-8)
- if _locale_encoding and _locale_encoding != 'utf-8':
- encoding_candidates.append(_locale_encoding)
- # TODO: don't fall back to 'latin-1' (API change).
- encoding_candidates.append('latin-1')
+ # If UTF-8 fails, fall back to the locale's preferred encoding:
+ fallback = locale.getpreferredencoding(do_setlocale=False)
+ if fallback and fallback.lower() != 'utf-8':
+ encoding_candidates.append(fallback)
for enc in encoding_candidates:
try:
decoded = str(data, enc, self.error_handler)
Modified: trunk/docutils/test/test_CLI.py
===================================================================
--- trunk/docutils/test/test_CLI.py 2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/test/test_CLI.py 2023-06-24 19:40:30 UTC (rev 9404)
@@ -57,7 +57,9 @@
del os.environ['DOCUTILSCONFIG']
sys.stdout = self.orig_stdout
sys.argv = self.orig_argv
- locale.setlocale(locale.LC_ALL, 'C') # restore default (C) locale
+ # restore default locale settings:
+ locale.setlocale(locale.LC_MESSAGES, 'C')
+ locale.setlocale(locale.LC_TIME, 'C')
def get_help_text(self, prog, entry_point):
# call entry_point function and collect help text
Modified: trunk/docutils/test/test_io.py
===================================================================
--- trunk/docutils/test/test_io.py 2023-06-24 19:40:12 UTC (rev 9403)
+++ trunk/docutils/test/test_io.py 2023-06-24 19:40:30 UTC (rev 9404)
@@ -7,9 +7,11 @@
"""
Test module for `docutils.io`.
"""
-import os.path
+import codecs
+import locale
from io import StringIO, BytesIO
+import os.path
from pathlib import Path
import sys
import unittest
@@ -24,7 +26,11 @@
# DATA_ROOT is ./test/data/ from the docutils root
DATA_ROOT = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')
+# normalize the preferred encoding's name:
+preferredencoding = codecs.lookup(
+ locale.getpreferredencoding(do_setlocale=False)).name
+
# Stub: Buffer with 'strict' auto-conversion of input to byte string:
class BBuf(BytesIO):
def write(self, data):
@@ -129,27 +135,6 @@
""")
self.assertNotEqual(input.successful_encoding, 'ascii')
- def test_readlines(self):
- input = du_io.FileInput(
- source_path=os.path.join(DATA_ROOT, 'include.txt'))
- data = input.readlines()
- self.assertEqual(data, ['Some include text.\n'])
-
- def test_heuristics_no_utf8(self):
- # if no encoding is given and decoding with 'utf-8' fails,
- # use either the locale encoding (if specified) or 'latin-1':
- # Provisional: the second fallback 'latin-1' will be dropped
- probed_encodings = (du_io._locale_encoding, 'latin-1') # noqa
- input = du_io.FileInput(
- source_path=os.path.join(DATA_ROOT, 'latin1.txt'))
- data = input.read()
- if input.successful_encoding not in probed_encodings:
- raise AssertionError(
- "guessed encoding '%s' differs from probed encodings %r"
- % (input.successful_encoding, probed_encodings))
- if input.successful_encoding == 'latin-1':
- self.assertEqual(data, 'Gr\xfc\xdfe\n')
-
def test_decode_unicode(self):
# With the special value "unicode" or "Unicode":
uniinput = du_io.Input(encoding='unicode')
@@ -156,7 +141,8 @@
# keep unicode instances as-is
self.assertEqual(uniinput.decode('ja'), 'ja')
# raise AssertionError if data is not a `str` instance
- self.assertRaises(AssertionError, uniinput.decode, b'ja')
+ with self.assertRaises(AssertionError):
+ uniinput.decode(b'ja')
class OutputTests(unittest.TestCase):
@@ -295,22 +281,27 @@
source_path=os.path.join(DATA_ROOT, 'utf8.txt'))
self.assertEqual(source.read(), 'Grüße\n')
- @unittest.skipIf(du_io._locale_encoding in (None, 'utf-8', 'utf8'),
+ @unittest.skipIf(preferredencoding in (None, 'ascii', 'utf-8'),
'locale encoding not set or UTF-8')
def test_fallback_no_utf8(self):
- # if decoding with 'utf-8' fails, use the locale encoding
- # (if not None) or 'latin-1'.
- # provisional: behaviour details will change in future
- # TODO: don't fall back to latin1
- # TODO: use `locale.getpreferredlocale()` (honour UTF-8 mode)?
- probed_encodings = (du_io._locale_encoding, 'latin-1') # noqa
+ # If no encoding is given and decoding with 'utf-8' fails,
+ # use the locale's preferred encoding (if not None).
+ # Provisional: the default will become 'utf-8'
+ # (without auto-detection and fallback) in Docutils 0.22.
source = du_io.FileInput(
source_path=os.path.join(DATA_ROOT, 'latin1.txt'))
data = source.read()
- self.assertTrue(source.successful_encoding in probed_encodings)
- if source.successful_encoding in ('latin-1', 'iso8859-1'):
+ successful_encoding = codecs.lookup(source.successful_encoding).name
+ self.assertEqual(successful_encoding, preferredencoding)
+ if successful_encoding == 'iso8859-1':
self.assertEqual(data, 'Grüße\n')
+ def test_readlines(self):
+ source = du_io.FileInput(
+ source_path=os.path.join(DATA_ROOT, 'include.txt'))
+ data = source.readlines()
+ self.assertEqual(data, ['Some include text.\n'])
+
if __name__ == '__main__':
unittest.main()
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|