From: Syn W. <syn...@jt...> - 2001-08-25 00:36:09
|
Proposal for API additions to the class com.ibm.text.UTF16 Expires: September 7th 2001 Please submit comments to this list by the above date. Background: The UTF16 class provides methods to manage the character conversions and indexing conversions between UTF32 and UTF16. Currently, the accessor and modifier APIs take java.lang.String and java.lang.StringBuffer as their input arguments, respectively. This proposal adds convenience methods to allow a wider range of input arguments. Additional Methods: 1) /** * Adds a codepoint to the end of the argument char array * @param target char array to be appended with the new code point * @param length number of UTF16 characters in the target array, the codepoint will be * appended after the last character. * @param size total allocated space in the target char array. Size in terms of * the number of chars. * @param char32 code point to be appended * @return the new target length. * @exception java.lang.ArrayIndexOutOfBoundsException thrown if an attempt is made to * append the codepoint to an out of bounds offset */ public static int append(char[] target, int length, int size, int char32) 2) /** * Returns the type of the boundaries around the char at offset16. Used for random access. * @param source string buffer to analyse * @param offset16 UTF16 offset * @return * <ul> * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, * offset16+1] * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; * the bounds are [offset16, offset16 + 2] * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; * the bounds are [offset16 - 1, offset16 + 1] * </ul> * For bit-twiddlers, the return values for these are chosen so that the boundaries * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. * @exception java.lang.StringIndexOutOfBoundsException if offset16 is out of bounds. 
*/ public static int bounds(java.lang.StringBuffer source, int offset16) 3) /** * Returns the type of the boundaries around the char at offset16. Used for random access. * @param source char array to analyse * @param start offset to substring in the source array for analyzing * @param end offset to substring in the source array for analyzing * @param offset16 UTF16 offset * @return * <ul> * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, * offset16+1] * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; * the bounds are [offset16, offset16 + 2] * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; * the bounds are [offset16 - 1, offset16 + 1] * </ul> * For bit-twiddlers, the return values for these are chosen so that the boundaries * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)]. * @exception java.lang.ArrayIndexOutOfBoundsException if offset16 is not within the range * of start and end. */ public static int bounds(char source[], int start, int end, int offset16) 4) /** * Returns the type of the boundaries around the char at offset32. Used for random access. * @param source string buffer to analyse * @param offset32 UTF32 offset * @return * <ul> * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset32, * offset32+1] * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset32; * the bounds are [offset32, offset32 + 2] * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset32 - 1; * the bounds are [offset32 - 1, offset32 + 1] * </ul> * For bit-twiddlers, the return values for these are chosen so that the boundaries * can be gotten by: [offset32 - (value >> 2), offset32 + (value & 3)]. * @exception java.lang.StringIndexOutOfBoundsException if offset32 is out of bounds. */ public static int boundsAtCodePointOffset(java.lang.StringBuffer source, int offset32) 5) /** * Returns the type of the boundaries around the char at offset32. 
Used for random access. * @param source char array to analyse * @param start offset to substring in the source array for analyzing * @param end offset to substring in the source array for analyzing * @param offset32 UTF32 offset * @return * <ul> * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset32, * offset32+1] * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset32; * the bounds are [offset32, offset32 + 2] * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset32 - 1; * the bounds are [offset32 - 1, offset32 + 1] * </ul> * For bit-twiddlers, the return values for these are chosen so that the boundaries * can be gotten by: [offset32 - (value >> 2), offset32 + (value & 3)]. * @exception java.lang.ArrayIndexOutOfBoundsException if offset32 is not within the range * of start and end. */ public static int boundsAtCodePointOffset(char source[], int start, int end, int offset32) 6) /** * Extract a single UTF-32 value from a string buffer. * Used when iterating forwards or backwards (with * <code>UTF16.getCharCount()</code>), as well as for random access. If a validity * check is required, use <code><a href="../UCharacter.html#isLegal(char)"> * UCharacter.isLegal()</a></code> on the return value. * If the char retrieved is part of a surrogate pair, its supplementary * character will be returned. If a complete supplementary character is not * found, the incomplete character will be returned. * @param source string buffer of UTF-16 chars * @param offset16 UTF-16 offset to the start of the character. * @return UTF-32 value of the code point that contains the char at * offset16. The boundaries of that codepoint are the same as in * <code>bounds32()</code>. * @exception java.lang.StringIndexOutOfBoundsException thrown if offset16 is * out of bounds. */ public static int charAt(java.lang.StringBuffer source, int offset16) 7) /** * Extract a single UTF-32 value from a substring. 
* Used when iterating forwards or backwards (with * <code>UTF16.getCharCount()</code>), as well as for random access. If a validity * check is required, use <code><a href="../UCharacter.html#isLegal(char)"> * UCharacter.isLegal()</a></code> on the return value. * If the char retrieved is part of a surrogate pair, its supplementary * character will be returned. If a complete supplementary character is not * found, the incomplete character will be returned. * @param source array of UTF-16 chars * @param start offset to substring in the source array for analyzing * @param end offset to substring in the source array for analyzing * @param offset16 UTF-16 offset to the start of the character. * @return UTF-32 value of the code point that contains the char at * offset16. The boundaries of that codepoint are the same as in * <code>bounds32()</code>. * @exception java.lang.ArrayIndexOutOfBoundsException thrown if offset16 is * not within the range of start and end. */ public static int charAt(char source[], int start, int end, int offset16) 8) /** * Extract a single UTF-32 value from a string buffer. * If a validity check is required, use * <code><a href="../UCharacter.html#isLegal(char)"> * UCharacter.isLegal()</a></code> on the return value. * If the char retrieved is part of a surrogate pair, its supplementary * character will be returned. If a complete supplementary character is not * found, the incomplete character will be returned. * @param source string buffer of UTF-16 chars * @param offset32 UTF-32 offset to the start of the character. * @return UTF-32 value of the code point that contains the char at * offset32. The boundaries of that codepoint are the same as in * <code>bounds32()</code>. * @exception java.lang.StringIndexOutOfBoundsException if offset32 is * out of bounds. */ public static int charAtCodePointOffset(java.lang.StringBuffer source, int offset32) 9) /** * Extract a single UTF-32 value from a substring. 
* If a validity check is required, use * <code><a href="../UCharacter.html#isLegal(char)"> * UCharacter.isLegal()</a></code> on the return value. * If the char retrieved is part of a surrogate pair, its supplementary * character will be returned. If a complete supplementary character is not * found, the incomplete character will be returned. * @param source array of UTF-16 chars * @param start offset to substring in the source array for analyzing * @param end offset to substring in the source array for analyzing * @param offset32 UTF-32 offset to the start of the character. * @return UTF-32 value of the code point that contains the char at * offset32. The boundaries of that codepoint are the same as in * <code>bounds32()</code>. * @exception java.lang.ArrayIndexOutOfBoundsException if offset32 is * out of bounds. */ public static int charAtCodePointOffset(char source[], int start, int end, int offset32) 10) /** * Number of codepoints in a UTF16 String buffer * @param source UTF16 string buffer * @return number of codepoints in the string buffer */ public static int countCodePoint(StringBuffer source) 11) /** * Number of codepoints in a UTF16 char array substring * @param source UTF16 char array * @param start offset of the substring * @param end offset of the substring * @return number of codepoints in the substring */ public static int countCodePoint(char source[], int start, int end) 12) /** * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or * after the given UTF-16 offset. Used for random access. See the * <a name="_top_">class description</a> for notes on roundtripping.<br> * <i>Note: If the UTF-16 offset points into the middle of a surrogate pair, then * the UTF-32 offset of the <strong>lead</strong> of the pair is returned. 
* </i> * <p> * To find the UTF-32 length of a string, use: * <pre> * len32 = countCodePoint(source); * </pre> * </p> * @param source text to analyse * @param offset16 UTF-16 offset, less than the source text length. * @return UTF-32 offset * @exception java.lang.StringIndexOutOfBoundsException if offset16 is * out of bounds. */ public static int findCodePointOffset(java.lang.StringBuffer source, int offset16) 13) /** * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or * after the given UTF-16 offset. Used for random access. See the * <a name="_top_">class description</a> for notes on roundtripping.<br> * <i>Note: If the UTF-16 offset points into the middle of a surrogate pair, then * the UTF-32 offset of the <strong>lead</strong> of the pair is returned. * </i> * <p> * To find the UTF-32 length of a substring, use: * <pre> * len32 = countCodePoint(source, start, end); * </pre> * </p> * @param source text to analyse * @param start offset of the substring * @param end offset of the substring * @param offset16 UTF-16 offset * @return UTF-32 offset * @exception java.lang.ArrayIndexOutOfBoundsException if offset16 is * not within the range of start and end. */ public static int findCodePointOffset(char source[], int start, int end, int offset16) 14) /** * Returns the UTF-16 offset that corresponds to a UTF-32 offset. * Used for random access. See the <a name="_top_">class description</a> for * notes on roundtripping. * @param source the UTF-16 string buffer * @param offset32 UTF-32 offset * @return UTF-16 offset * @exception java.lang.StringIndexOutOfBoundsException if offset32 is out of * bounds. */ public static int findOffsetFromCodePoint(java.lang.StringBuffer source, int offset32) 15) /** * Returns the UTF-16 offset that corresponds to a UTF-32 offset. * Used for random access. See the <a name="_top_">class description</a> for * notes on roundtripping. 
* @param source the UTF-16 char array whose substring is to be analysed * @param start offset of the substring to be analysed * @param end offset of the substring to be analysed * @param offset32 UTF-32 offset * @return UTF-16 offset * @exception java.lang.ArrayIndexOutOfBoundsException if offset32 is out of * bounds. */ public static int findOffsetFromCodePoint(char source[], int start, int end, int offset32) 16) /** * Set a code point into a UTF16 position in a substring. * This method adjusts the character array accordingly if a supplementary * code point is to be replaced by a non-supplementary code point and * vice versa. * @param target char array * @param start offset of the substring * @param end offset of the substring * @param offset16 UTF16 position to insert into * @param char32 code point * @exception java.lang.ArrayIndexOutOfBoundsException if offset16 is out of * range of start and end. */ public static void setCharAt(char target[], int start, int end, int offset16, int char32) 17) /** * Sets a code point into a UTF32 position within a substring. * This method adjusts the character array accordingly if a supplementary * code point is to be replaced by a non-supplementary code point and * vice versa. * @param target char array * @param start offset of the substring * @param end offset of the substring * @param offset32 UTF32 position to insert into * @param char32 code point * @exception java.lang.ArrayIndexOutOfBoundsException if offset32 is out of * bounds. */ public static void setCharAtCodePointOffset(char target[], int start, int end, int offset32, int char32) Syn Wee Quek IBM GCoC, Cupertino, CA, USA |