From: Wolfgang M. M. <wol...@us...> - 2004-03-31 12:33:55
|
Update of /cvsroot/exist/eXist-1.0/src/org/exist/storage/analysis In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv23288/src/org/exist/storage/analysis Modified Files: SimpleTokenizer.java Log Message: Text tokenizer got caught in an endless loop. Index: SimpleTokenizer.java =================================================================== RCS file: /cvsroot/exist/eXist-1.0/src/org/exist/storage/analysis/SimpleTokenizer.java,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** SimpleTokenizer.java 2 Oct 2003 12:20:19 -0000 1.6 --- SimpleTokenizer.java 31 Mar 2004 12:22:07 -0000 1.7 *************** *** 43,50 **** char ch = LA(1); int count = 0; ! while (ch != (char) - 1) { if (ch == '\\' && isWildcard(LA(2))) { break; ! } else if (singleCharToken(ch)) { // if this is a single char token and first in the sequence, // consume it --- 43,51 ---- char ch = LA(1); int count = 0; ! while (ch != (char) -1) { if (ch == '\\' && isWildcard(LA(2))) { break; ! } ! if (singleCharToken(ch)) { // if this is a single char token and first in the sequence, // consume it *************** *** 55,59 **** } break; ! } else if (isNonBreakingCharacter(ch) || (allowWildcards && isWildcard(ch))) { token.consumeNext(); consume(); --- 56,60 ---- } break; ! } else if (Character.isLetter(ch) || nonBreakingChar(ch) || (allowWildcards && isWildcard(ch))) { token.consumeNext(); consume(); *************** *** 291,299 **** } - private boolean isNonBreakingCharacter(char ch) { - return Character.isLetter(ch) - && (!singleCharToken(ch)); - } - /** * The code ranges defined here should be interpreted as 1-char --- 292,295 ---- *************** *** 348,365 **** public static void main(String args[]) { ! String t1 = "\u30A8\u31A1\uACFF\u2FAA\u312A\u3045"; String t2 = "é¸å®çè¥ç©¶åºä¸æ³ä»¥é¸ä½éªå¸«åå ¶è¨æ¯ æ¬²è¶ "; String t3 = "문ì ì¬ì© ìì ì¤ë¥ë¥¼ ì°¾ìë´ê¸° ìí´ ê²ì¦ë ì¤êµì´ íì ì¬ê²í íê³ , ë³´ë¤ ì½ê¸° ì½ê² í기 ìí´ ì¸ì´ì ííì ë¤ë¬ëë¤."; ! for(int i = 0; i < t2.length(); i++) { ! char ch = t2.charAt(i); ! System.out.print( ! Integer.toHexString(ch) + ' ' ! ); ! } SimpleTokenizer tokenizer = new SimpleTokenizer(); ! tokenizer.setText(t3); TextToken token = tokenizer.nextToken(true); while(token != null && token.getType() != TextToken.EOF) { ! System.out.println(token.getText()); token = tokenizer.nextToken(true); } --- 344,361 ---- public static void main(String args[]) { ! String t1 = "\u30A8\u30FB\u31A1\uACFF\u2FAA\u312A\u3045"; String t2 = "é¸å®çè¥ç©¶åºä¸æ³ä»¥é¸ä½éªå¸«åå ¶è¨æ¯ æ¬²è¶ "; String t3 = "문ì ì¬ì© ìì ì¤ë¥ë¥¼ ì°¾ìë´ê¸° ìí´ ê²ì¦ë ì¤êµì´ íì ì¬ê²í íê³ , ë³´ë¤ ì½ê¸° ì½ê² í기 ìí´ ì¸ì´ì ííì ë¤ë¬ëë¤."; ! // for(int i = 0; i < t2.length(); i++) { ! // char ch = t2.charAt(i); ! // System.out.print( ! // Integer.toHexString(ch) + ' ' ! // ); ! // } SimpleTokenizer tokenizer = new SimpleTokenizer(); ! tokenizer.setText(t2); TextToken token = tokenizer.nextToken(true); while(token != null && token.getType() != TextToken.EOF) { ! //System.out.println(token.getText()); token = tokenizer.nextToken(true); } |