[Htmlparser-cvs] htmlparser/src/org/htmlparser RemarkNodeParser.java,1.27,1.28
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-09-01 19:56:05
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv19769 Modified Files: RemarkNodeParser.java Log Message: Workaround for bug #788746: the parser crashes on comments like <!-- foobar --!>. This is not a real solution, because the codebase assumes remarks end with -->; it only avoids the crash, and toHtml() will output --!-->, which isn't really correct. Added the test case as RemarkNodeParserTest.testExclamationComment(). Index: RemarkNodeParser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNodeParser.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** RemarkNodeParser.java 24 Aug 2003 21:59:41 -0000 1.27 --- RemarkNodeParser.java 1 Sep 2003 19:55:59 -0000 1.28 *************** *** 30,172 **** public class RemarkNodeParser { ! public final static int REMARK_NODE_BEFORE_PARSING_STATE=0; ! public final static int REMARK_NODE_OPENING_ANGLE_BRACKET_STATE=1; ! public final static int REMARK_NODE_EXCLAMATION_RECEIVED_STATE=2; ! public final static int REMARK_NODE_FIRST_DASH_RECEIVED_STATE=3; ! public final static int REMARK_NODE_ACCEPTING_STATE=4; ! public final static int REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE=5; ! public final static int REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE=6; ! public final static int REMARK_NODE_ACCEPTED_STATE=7; ! public final static int REMARK_NODE_ILLEGAL_STATE=8; ! public final static int REMARK_NODE_FINISHED_PARSING_STATE=2; ! ! /** ! * Locate the remark tag withing the input string, by parsing from the given position ! * @param reader HTML reader to be provided so as to allow reading of next line ! * @param input Input String ! * @param position Position to start parsing from ! */ ! public RemarkNode find(NodeReader reader,String input,int position) ! { ! int state = REMARK_NODE_BEFORE_PARSING_STATE; ! StringBuffer tagContents = new StringBuffer(); ! 
int tagBegin=0; ! int tagEnd=0; ! int i=position; ! int inputLen = input.length(); ! char ch,prevChar=' '; ! while (i < inputLen && state < REMARK_NODE_ACCEPTED_STATE) ! { ! ch = input.charAt(i); ! if (state == REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE) { ! if (ch == '>') ! { ! state=REMARK_NODE_ACCEPTED_STATE; ! tagEnd=i; ! } else if (ch=='-') { ! tagContents.append(prevChar); ! } else ! { ! // Rollback last 2 characters (assumed same) ! state = REMARK_NODE_ACCEPTING_STATE; ! tagContents.append(prevChar); ! tagContents.append(prevChar); ! } ! } ! if (state==REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE) ! { ! if (ch == '-') ! { ! state=REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE; ! } else ! { ! // Rollback ! state = REMARK_NODE_ACCEPTING_STATE; ! tagContents.append(prevChar); ! } ! } ! if (state==REMARK_NODE_ACCEPTING_STATE) { ! if (ch == '-') { ! state=REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE; ! } /*else ! if (ch == '<') ! { ! state=REMARK_NODE_ILLEGAL_STATE; ! } */ ! } ! if (state==REMARK_NODE_ACCEPTING_STATE) ! { ! // We can append contents now ! tagContents.append(ch); ! } ! ! if (state==REMARK_NODE_FIRST_DASH_RECEIVED_STATE) ! { ! if (ch == '-') { ! state=REMARK_NODE_ACCEPTING_STATE; ! // Do a lookahead and see if the next char is > ! if (input.length()>i+1 && input.charAt(i+1)=='>') { ! state=REMARK_NODE_ACCEPTED_STATE;tagEnd=i+1; ! } ! } ! else state=REMARK_NODE_ILLEGAL_STATE; ! } ! if (state==REMARK_NODE_EXCLAMATION_RECEIVED_STATE) ! { ! if (ch == '-') ! state=REMARK_NODE_FIRST_DASH_RECEIVED_STATE; ! else if (ch == '>') ! { ! state=REMARK_NODE_ACCEPTED_STATE; ! tagEnd=i; ! } ! else state=REMARK_NODE_ILLEGAL_STATE; ! } ! if (state==REMARK_NODE_OPENING_ANGLE_BRACKET_STATE) ! { ! if (ch == '!') ! state=REMARK_NODE_EXCLAMATION_RECEIVED_STATE; ! else state = REMARK_NODE_ILLEGAL_STATE; // This is not a remark tag ! } ! if (state == REMARK_NODE_BEFORE_PARSING_STATE) ! { ! if (ch=='<') { ! 
// Transition from State 0 to State 1 - Record data till > is encountered ! tagBegin = i; ! state = REMARK_NODE_OPENING_ANGLE_BRACKET_STATE; ! } ! else if (ch!=' ') { ! // Its not a space, hence this is probably a string node, not a remark node ! state = REMARK_NODE_ILLEGAL_STATE; ! } ! } ! // if (state > REMARK_NODE_OPENING_ANGLE_BRACKET_STATE && state < REMARK_NODE_ACCEPTED_STATE && i == input.length() - 1) ! if (state >=REMARK_NODE_ACCEPTING_STATE && state < REMARK_NODE_ACCEPTED_STATE && i == input.length() - 1) ! { ! // We need to continue parsing to the next line ! //input = reader.getNextLine(); ! tagContents.append(Parser.getLineSeparator()); ! do { ! input = reader.getNextLine(); ! } ! while (input!=null && input.length()==0); ! if (input!=null) ! inputLen = input.length(); else inputLen=-1; ! i=-1; ! } ! if (state==REMARK_NODE_ILLEGAL_STATE) ! { ! return null; ! } ! i++; ! prevChar = ch; ! } ! if (state==REMARK_NODE_ACCEPTED_STATE) ! return new RemarkNode(tagBegin,tagEnd,tagContents.toString()); ! else ! return null; ! } } --- 30,176 ---- public class RemarkNodeParser { ! public final static int REMARK_NODE_BEFORE_PARSING_STATE=0; ! public final static int REMARK_NODE_OPENING_ANGLE_BRACKET_STATE=1; ! public final static int REMARK_NODE_EXCLAMATION_RECEIVED_STATE=2; ! public final static int REMARK_NODE_FIRST_DASH_RECEIVED_STATE=3; ! public final static int REMARK_NODE_ACCEPTING_STATE=4; ! public final static int REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE=5; ! public final static int REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE=6; ! public final static int REMARK_NODE_ACCEPTED_STATE=7; ! public final static int REMARK_NODE_ILLEGAL_STATE=8; ! public final static int REMARK_NODE_FINISHED_PARSING_STATE=2; ! ! /** ! * Locate the remark tag withing the input string, by parsing from the given position ! * @param reader HTML reader to be provided so as to allow reading of next line ! * @param input Input String ! 
* @param position Position to start parsing from ! */ ! public RemarkNode find(NodeReader reader,String input,int position) ! { ! int state = REMARK_NODE_BEFORE_PARSING_STATE; ! StringBuffer tagContents = new StringBuffer(); ! int tagBegin=0; ! int tagEnd=0; ! int i=position; ! int inputLen = input.length(); ! char ch,prevChar=' '; ! while (i < inputLen && state < REMARK_NODE_ACCEPTED_STATE) ! { ! ch = input.charAt(i); ! if (state == REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE) { ! if (ch == '>') ! { ! state=REMARK_NODE_ACCEPTED_STATE; ! tagEnd=i; ! } else if (ch=='-') { ! tagContents.append(prevChar); ! } else if (ch=='!') { ! tagContents.append(prevChar); ! tagContents.append(prevChar); ! tagContents.append(ch); ! } else ! { ! // Rollback last 2 characters (assumed same) ! state = REMARK_NODE_ACCEPTING_STATE; ! tagContents.append(prevChar); ! tagContents.append(prevChar); ! } ! } ! if (state==REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE) ! { ! if (ch == '-') ! { ! state=REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE; ! } else ! { ! // Rollback ! state = REMARK_NODE_ACCEPTING_STATE; ! tagContents.append(prevChar); ! } ! } ! if (state==REMARK_NODE_ACCEPTING_STATE) { ! if (ch == '-') { ! state=REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE; ! } /*else ! if (ch == '<') ! { ! state=REMARK_NODE_ILLEGAL_STATE; ! } */ ! } ! if (state==REMARK_NODE_ACCEPTING_STATE) ! { ! // We can append contents now ! tagContents.append(ch); ! } ! ! if (state==REMARK_NODE_FIRST_DASH_RECEIVED_STATE) ! { ! if (ch == '-') { ! state=REMARK_NODE_ACCEPTING_STATE; ! // Do a lookahead and see if the next char is > ! if (input.length()>i+1 && input.charAt(i+1)=='>') { ! state=REMARK_NODE_ACCEPTED_STATE;tagEnd=i+1; ! } ! } ! else state=REMARK_NODE_ILLEGAL_STATE; ! } ! if (state==REMARK_NODE_EXCLAMATION_RECEIVED_STATE) ! { ! if (ch == '-') ! state=REMARK_NODE_FIRST_DASH_RECEIVED_STATE; ! else if (ch == '>') ! { ! state=REMARK_NODE_ACCEPTED_STATE; ! tagEnd=i; ! } ! 
else state=REMARK_NODE_ILLEGAL_STATE; ! } ! if (state==REMARK_NODE_OPENING_ANGLE_BRACKET_STATE) ! { ! if (ch == '!') ! state=REMARK_NODE_EXCLAMATION_RECEIVED_STATE; ! else state = REMARK_NODE_ILLEGAL_STATE; // This is not a remark tag ! } ! if (state == REMARK_NODE_BEFORE_PARSING_STATE) ! { ! if (ch=='<') { ! // Transition from State 0 to State 1 - Record data till > is encountered ! tagBegin = i; ! state = REMARK_NODE_OPENING_ANGLE_BRACKET_STATE; ! } ! else if (ch!=' ') { ! // Its not a space, hence this is probably a string node, not a remark node ! state = REMARK_NODE_ILLEGAL_STATE; ! } ! } ! // if (state > REMARK_NODE_OPENING_ANGLE_BRACKET_STATE && state < REMARK_NODE_ACCEPTED_STATE && i == input.length() - 1) ! if (state >=REMARK_NODE_ACCEPTING_STATE && state < REMARK_NODE_ACCEPTED_STATE && i == input.length() - 1) ! { ! // We need to continue parsing to the next line ! //input = reader.getNextLine(); ! tagContents.append(Parser.getLineSeparator()); ! do { ! input = reader.getNextLine(); ! } ! while (input!=null && input.length()==0); ! if (input!=null) ! inputLen = input.length(); else inputLen=-1; ! i=-1; ! } ! if (state==REMARK_NODE_ILLEGAL_STATE) ! { ! return null; ! } ! i++; ! prevChar = ch; ! } ! if (state==REMARK_NODE_ACCEPTED_STATE) ! return new RemarkNode(tagBegin,tagEnd,tagContents.toString()); ! else ! return null; ! } } |