[Htmlparser-cvs] htmlparser/src/org/htmlparser/parserHelper CompositeTagScannerHelper.java,1.47,1.48
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper In directory sc8-pr-cvs1:/tmp/cvs-serv30684/parserHelper Modified Files: CompositeTagScannerHelper.java ScriptScannerHelper.java Removed Files: AttributeParser.java StringParser.java TagParser.java Log Message: Lexer Integration. Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a Hashtable. Index: CompositeTagScannerHelper.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/CompositeTagScannerHelper.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** CompositeTagScannerHelper.java 22 Sep 2003 02:39:59 -0000 1.47 --- CompositeTagScannerHelper.java 28 Sep 2003 15:33:58 -0000 1.48 *************** *** 29,37 **** package org.htmlparser.parserHelper; import org.htmlparser.Node; ! 
import org.htmlparser.NodeReader; import org.htmlparser.scanners.CompositeTagScanner; import org.htmlparser.tags.CompositeTag; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; --- 29,40 ---- package org.htmlparser.parserHelper; + import java.util.Vector; import org.htmlparser.Node; ! import org.htmlparser.lexer.Cursor; ! import org.htmlparser.lexer.Lexer; ! import org.htmlparser.lexer.nodes.Attribute; ! import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.scanners.CompositeTagScanner; import org.htmlparser.tags.CompositeTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; *************** *** 43,49 **** private CompositeTagScanner scanner; private Tag tag; ! private String url; ! private NodeReader reader; ! private String currLine; private Tag endTag; private NodeList nodeList; --- 46,50 ---- private CompositeTagScanner scanner; private Tag tag; ! private Lexer mLexer; private Tag endTag; private NodeList nodeList; *************** *** 56,69 **** CompositeTagScanner scanner, Tag tag, ! String url, ! NodeReader reader, ! String currLine, boolean balance_quotes) { this.scanner = scanner; this.tag = tag; ! this.url = url; ! this.reader = reader; ! this.currLine = currLine; this.endTag = null; this.nodeList = new NodeList(); --- 57,66 ---- CompositeTagScanner scanner, Tag tag, ! Lexer lexer, boolean balance_quotes) { this.scanner = scanner; this.tag = tag; ! mLexer = lexer; this.endTag = null; this.nodeList = new NodeList(); *************** *** 73,77 **** public Tag scan() throws ParserException { ! this.startingLineNumber = reader.getLastLineNumber(); if (shouldCreateEndTagAndExit()) { return createEndTagAndRepositionReader(); --- 70,74 ---- public Tag scan() throws ParserException { ! 
startingLineNumber = mLexer.getCurrentLineNumber (); if (shouldCreateEndTagAndExit()) { return createEndTagAndRepositionReader(); *************** *** 83,89 **** if (!endTagFound) { do { ! currentNode = reader.readElement(balance_quotes); ! if (currentNode==null) continue; ! currLine = reader.getCurrentLine(); if (currentNode instanceof Tag) doForceCorrectionCheckOn((Tag)currentNode); --- 80,86 ---- if (!endTagFound) { do { ! currentNode = mLexer.nextNode (); // balance_quotes ? ! if (currentNode==null) ! continue; if (currentNode instanceof Tag) doForceCorrectionCheckOn((Tag)currentNode); *************** *** 95,103 **** while (currentNode!=null && !endTagFound); } ! if (endTag==null) { ! createCorrectionEndTagBefore(reader.getLastReadPosition()+1); ! } ! this.endingLineNumber = reader.getLastLineNumber(); return createTag(); } --- 92,99 ---- while (currentNode!=null && !endTagFound); } ! if (endTag==null) ! createCorrectionEndTagBefore (mLexer.getCursor ().getPosition ()); ! endingLineNumber = mLexer.getCurrentLineNumber (); return createTag(); } *************** *** 108,161 **** private Tag createEndTagAndRepositionReader() { ! createCorrectionEndTagBefore(tag.elementBegin()); ! reader.setPosInLine(tag.elementBegin()); ! reader.setDontReadNextLine(true); return endTag; } ! private void createCorrectionEndTagBefore(int pos) { ! String endTagName = tag.getTagName(); ! int endTagBegin = pos ; ! int endTagEnd = endTagBegin + endTagName.length() + 2; ! endTag = new EndTag( ! new TagData( ! endTagBegin, ! endTagEnd, endTagName, ! currLine ! ) ! ); } private void createCorrectionEndTagBefore(Tag possibleEndTagCauser) { ! String endTagName = tag.getTagName(); int endTagBegin = possibleEndTagCauser.elementBegin(); int endTagEnd = endTagBegin + endTagName.length() + 2; possibleEndTagCauser.setTagBegin(endTagEnd+1); ! reader.addNextParsedNode(possibleEndTagCauser); ! endTag = new EndTag( ! new TagData( ! endTagBegin, ! endTagEnd, endTagName, ! currLine ! ) ! ); } ! 
private Tag createTag() throws ParserException { ! CompositeTag newTag = ! (CompositeTag) ! scanner.createTag( ! new TagData( ! tag.elementBegin(), ! endTag.elementEnd(), ! startingLineNumber, ! endingLineNumber, ! tag.getText(), ! currLine, ! url, ! tag.isEmptyXmlTag() ! ), new CompositeTagData( tag,endTag,nodeList --- 104,156 ---- private Tag createEndTagAndRepositionReader() { ! createCorrectionEndTagBefore (tag.elementBegin ()); ! mLexer.setPosition (tag.elementBegin ()); return endTag; } ! private void createCorrectionEndTagBefore(int position) ! { ! String endTagName = "/" + tag.getTagName(); ! Vector attributes = new Vector (); ! attributes.addElement (new Attribute (endTagName, (String)null, (char)0)); ! TagData data = new TagData( endTagName, ! position, ! attributes, ! mLexer.getPage ().getUrl (), ! false); ! endTag = new Tag (data); } private void createCorrectionEndTagBefore(Tag possibleEndTagCauser) { ! String endTagName = "/" + tag.getTagName(); int endTagBegin = possibleEndTagCauser.elementBegin(); int endTagEnd = endTagBegin + endTagName.length() + 2; possibleEndTagCauser.setTagBegin(endTagEnd+1); ! Vector attributes = new Vector (); ! attributes.addElement (new Attribute (endTagName, (String)null, (char)0)); ! TagData data = new TagData( endTagName, ! endTagBegin, ! attributes, ! mLexer.getPage ().getUrl (), ! false); ! ! endTag = new Tag(data); } ! private Tag createTag() throws ParserException ! { ! TagData data; ! ! data = new TagData( ! mLexer.getPage (), ! tag.elementBegin(), ! endTag.elementEnd(), ! tag.getAttributesEx (), ! mLexer.getPage ().getUrl (), ! tag.isEmptyXmlTag ()); ! ! CompositeTag newTag = (CompositeTag)scanner.createTag (data, new CompositeTagData( tag,endTag,nodeList *************** *** 169,179 **** } ! private void doChildAndEndTagCheckOn(Node currentNode) { ! if (currentNode instanceof EndTag) { ! EndTag possibleEndTag = (EndTag)currentNode; ! if (isExpectedEndTag(possibleEndTag)) { ! endTagFound = true; ! 
endTag = possibleEndTag; ! return; } } --- 164,182 ---- } ! private void doChildAndEndTagCheckOn(Node currentNode) ! { ! Tag tag; ! ! if (currentNode instanceof Tag) ! { ! tag = (Tag)currentNode; ! if (tag.isEndTag ()) ! { ! if (isExpectedEndTag (tag)) ! { ! endTagFound = true; ! endTag =tag; ! return; ! } } } *************** *** 182,187 **** } ! private boolean isExpectedEndTag(EndTag possibleEndTag) { ! return possibleEndTag.getTagName().equals(tag.getTagName()); } --- 185,191 ---- } ! private boolean isExpectedEndTag (TagNode possibleEndTag) ! { ! return (possibleEndTag.getTagName().equals (tag.getTagName ())); } *************** *** 212,216 **** private boolean isSelfChildTagRecievedIncorrectly(Tag possibleEndTag) { return ( ! !(possibleEndTag instanceof EndTag) && !scanner.isAllowSelfChildren() && possibleEndTag.getTagName().equals(tag.getTagName()) --- 216,220 ---- private boolean isSelfChildTagRecievedIncorrectly(Tag possibleEndTag) { return ( ! !(possibleEndTag.isEndTag ()) && !scanner.isAllowSelfChildren() && possibleEndTag.getTagName().equals(tag.getTagName()) Index: ScriptScannerHelper.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/ScriptScannerHelper.java,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** ScriptScannerHelper.java 22 Sep 2003 02:39:59 -0000 1.12 --- ScriptScannerHelper.java 28 Sep 2003 15:33:58 -0000 1.13 *************** *** 29,32 **** --- 29,33 ---- import org.htmlparser.*; + import org.htmlparser.lexer.Lexer; import org.htmlparser.scanners.*; import org.htmlparser.tags.*; *************** *** 38,41 **** --- 39,43 ---- public class ScriptScannerHelper { + private Lexer mLexer; private int endTagLoc; private Tag endTag; *************** *** 44,48 **** private boolean sameLine; private boolean endTagFound; - private NodeReader reader; private StringBuffer scriptContents; private ScriptScanner scriptScanner; --- 
46,49 ---- *************** *** 51,56 **** private String currLine; ! public ScriptScannerHelper(Tag tag, String url, NodeReader nodeReader, String currLine, ScriptScanner scriptScanner) { ! this.reader = nodeReader; this.scriptScanner = scriptScanner; this.tag = tag; --- 52,58 ---- private String currLine; ! public ScriptScannerHelper(Tag tag, Lexer lexer, ScriptScanner scriptScanner) ! { ! mLexer = lexer; this.scriptScanner = scriptScanner; this.tag = tag; *************** *** 60,64 **** public Tag scan() throws ParserException { ! int startLine = reader.getLastLineNumber(); startTag = tag; extractScriptTagFrom(currLine); --- 62,66 ---- public Tag scan() throws ParserException { ! int startLine = mLexer.getCurrentLineNumber (); startTag = tag; extractScriptTagFrom(currLine); *************** *** 69,84 **** } ! private Tag createScriptTagUsing(String url, String currLine, int startLine) { return scriptScanner.createTag( ! new TagData( ! startTag.elementBegin(), ! endTag.elementEnd(), ! startLine, ! reader.getLastLineNumber(), ! startTag.getText(), ! currLine, ! url, ! false ! ), new CompositeTagData( startTag,endTag,createChildrenNodeList() ) --- 71,89 ---- } ! private Tag createScriptTagUsing(String url, String currLine, int startLine) ! { ! TagData data; ! ! data = new TagData( ! mLexer.getPage (), ! startTag.elementBegin(), ! endTag.elementEnd(), ! startTag.getAttributesEx (), ! mLexer.getPage ().getUrl (), ! startTag.isEmptyXmlTag ()); ! return scriptScanner.createTag( ! data, ! new CompositeTagData( startTag,endTag,createChildrenNodeList() ) *************** *** 100,114 **** private void createScriptEndTag(Tag tag, String currLine) { // If end tag doesn't exist, create one ! String endTagName = tag.getTagName(); ! int endTagBegin = reader.getLastReadPosition()+1 ; ! int endTagEnd = endTagBegin + endTagName.length() + 2; ! endTag = new EndTag( new TagData( - endTagBegin, - endTagEnd, endTagName, ! currLine ! ) ! 
); } --- 105,118 ---- private void createScriptEndTag(Tag tag, String currLine) { // If end tag doesn't exist, create one ! String endTagName = "/" + tag.getTagName(); ! int endTagBegin = mLexer.getPosition (); ! endTag = new Tag( new TagData( endTagName, ! endTagBegin, ! null, ! mLexer.getPage ().getUrl (), ! false) ! ); } *************** *** 118,139 **** private void extractScriptTagFrom(String currLine) throws ParserException { ! String line = null; ! scriptContents = new StringBuffer(); ! endTagFound = false; ! ! endTag = null; ! line = currLine; ! sameLine = true; ! startingPos = startTag.elementEnd(); ! do { ! doExtractionOfScriptContentsFrom(line); ! if (!endTagFound) { ! line = reader.getNextLine(); ! startingPos = 0; ! } ! if (sameLine) ! sameLine = false; ! } ! while (line!=null && !endTagFound); } --- 122,144 ---- private void extractScriptTagFrom(String currLine) throws ParserException { ! throw new IllegalStateException ("not implemented"); ! // String line = null; ! // scriptContents = new StringBuffer(); ! // endTagFound = false; ! // ! // endTag = null; ! // line = currLine; ! // sameLine = true; ! // startingPos = startTag.elementEnd(); ! // do { ! // doExtractionOfScriptContentsFrom(line); ! // if (!endTagFound) { ! // line = reader.getNextLine(); ! // startingPos = 0; ! // } ! // if (sameLine) ! // sameLine = false; ! // } ! // while (line!=null && !endTagFound); } *************** *** 163,181 **** private void extractEndTagFrom(String line) throws ParserException { ! endTagFound = true; ! endTag = (EndTag)EndTag.find(line,endTagLoc); ! if (sameLine) ! scriptContents.append( ! getCodeBetweenStartAndEndTags( ! line, ! startTag, ! endTagLoc) ! ); ! else { ! scriptContents.append(Parser.getLineSeparator()); ! scriptContents.append(line.substring(0,endTagLoc)); ! } ! ! reader.setPosInLine(endTag.elementEnd()); } --- 168,187 ---- private void extractEndTagFrom(String line) throws ParserException { ! 
throw new IllegalStateException ("not implemented"); ! // endTagFound = true; ! // endTag = (EndTag)EndTag.find(line,endTagLoc); ! // if (sameLine) ! // scriptContents.append( ! // getCodeBetweenStartAndEndTags( ! // line, ! // startTag, ! // endTagLoc) ! // ); ! // else { ! // scriptContents.append(Parser.getLineSeparator()); ! // scriptContents.append(line.substring(0,endTagLoc)); ! // } ! // ! // mLexer.setPosition (endTag.elementEnd ()); } --- AttributeParser.java DELETED --- --- StringParser.java DELETED --- --- TagParser.java DELETED --- |