Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/parserHelper AttributeParser.java,1.36,1.37 Composite

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper
In directory sc8-pr-cvs1:/tmp/cvs-serv31228/parserHelper

Modified Files:
	AttributeParser.java CompositeTagScannerHelper.java 
	ParserHelper.java ScriptScannerHelper.java StringParser.java 
	TagParser.java 
Log Message:
Change tabs to spaces in all source files.

Index: AttributeParser.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/AttributeParser.java,v
retrieving revision 1.36
retrieving revision 1.37
diff -C2 -d -r1.36 -r1.37
*** AttributeParser.java	24 Aug 2003 21:59:42 -0000	1.36
--- AttributeParser.java	3 Sep 2003 23:36:19 -0000	1.37
***************
*** 52,87 ****

! 	private Hashtable attributeTable;
! 	private String element;
! 	private String name;
! 	private String value;
! 	private String part;
! 	private String empty;
! 	private boolean equal;
! 	private StringTokenizer tokenizer;
! 	private boolean doubleQuote;
! 	private boolean singleQuote;
! 	private boolean ready;
! 	private String currentToken;
! 	private String tokenAccumulator;
! 	/**
! 	* Method to break the tag into pieces.
! 	* @param text All the text within the tag inside &lt; and &gt;.
      * @return A Hastable with elements containing the
! 	* pieces of the tag. The tag-name has the value field set to
! 	* the constant Tag.TAGNAME. In addition the tag-name is
! 	* stored into the Hashtable with the name Tag.TAGNAME
! 	* where the value is the name of the tag.
! 	* Tag parameters without value
! 	* has the value "". Parameters with value are represented
! 	* in the Hastable by a name/value pair.
! 	* As html is case insensitive but Hastable is not are all
! 	* names converted into UPPERCASE to the Hastable
! 	* E.g extract the href values from A-tag's and print them
! 	* <pre>
! 	*
      *    Tag tag;
! 	*    Hashtable h;
! 	*    String tmp;
      *    try {
      *        NodeReader in = new NodeReader(new FileReader(path),2048);
--- 52,87 ----

!     private Hashtable attributeTable;
!     private String element;
!     private String name;
!     private String value;
!     private String part;
!     private String empty;
!     private boolean equal;
!     private StringTokenizer tokenizer;
!     private boolean doubleQuote;
!     private boolean singleQuote;
!     private boolean ready;
!     private String currentToken;
!     private String tokenAccumulator;
!     /**
!     * Method to break the tag into pieces.
!     * @param text All the text within the tag inside &lt; and &gt;.
      * @return A Hastable with elements containing the
!     * pieces of the tag. The tag-name has the value field set to
!     * the constant Tag.TAGNAME. In addition the tag-name is
!     * stored into the Hashtable with the name Tag.TAGNAME
!     * where the value is the name of the tag.
!     * Tag parameters without value
!     * has the value "". Parameters with value are represented
!     * in the Hastable by a name/value pair.
!     * As html is case insensitive but Hastable is not are all
!     * names converted into UPPERCASE to the Hastable
!     * E.g extract the href values from A-tag's and print them
!     * <pre>
!     *
      *    Tag tag;
!     *    Hashtable h;
!     *    String tmp;
      *    try {
      *        NodeReader in = new NodeReader(new FileReader(path),2048);
***************
*** 102,118 ****
      *        ie.printStackTrace();
      *    }
! 	* </pre>
! 	*
! 	*/
     public Hashtable parseAttributes (String text) {
! 		attributeTable = new SpecialHashtable();
! 		part = null;
! 		empty = null;
          name=null;
          value=null;
          element=null;
! 		equal = false;
          delim=DELIMETERS;
! 		tokenizer = new StringTokenizer(text,delim,true);
          while (true) {
              part=getNextPartUsing(delim);
--- 102,118 ----
      *        ie.printStackTrace();
      *    }
!     * </pre>
!     *
!     */
     public Hashtable parseAttributes (String text) {
!         attributeTable = new SpecialHashtable();
!         part = null;
!         empty = null;
          name=null;
          value=null;
          element=null;
!         equal = false;
          delim=DELIMETERS;
!         tokenizer = new StringTokenizer(text,delim,true);
          while (true) {
              part=getNextPartUsing(delim);
***************
*** 127,131 ****
                  }
                  else {
!                    	processInvalidPart();
                      if (!tokenizer.hasMoreTokens ())
                          break;
--- 127,131 ----
                  }
                  else {
!                     processInvalidPart();
                      if (!tokenizer.hasMoreTokens ())
                          break;
***************
*** 160,201 ****
      }

! 	private boolean isValid(String part) {
! 		return part != null && (0 < part.length ());
! 	}

! 	private void process(String part) {
! 		if (name == null) {
! 		    if (!part.substring(0,1).equals(" ")) {
! 		        name = part;
! 		        equal=true;
! 		    }
! 		}
! 		else {
! 		    if (equal){
! 		        if (part.equals("=")) {
! 		            equal=false;
! 		            delim=DELIMETERS_WITHOUT_EQUALS;
                      value=Tag.NOTHING;
! 		        }
! 		        else {
! 		             putDataIntoTable(attributeTable,name,Tag.NULLVALUE,false);
! 		             name=part;
! 		             value=null;
! 		        }
! 		    }
! 		    if (!equal && !part.equals("=")) {
! 		        value=part;
! 		        putDataIntoTable(attributeTable,name,value,false);
! 		        name=null;
! 		        value=null;
! 		    }
! 		}
! 	}

      private String getNextPartUsing(String delimiter) {
! 		tokenAccumulator = null;
! 		doubleQuote = false;
! 		singleQuote = false;
! 		ready = false;
          while (ready == false && tokenizer.hasMoreTokens()) {
              currentToken = tokenizer.nextToken(delimiter);
--- 160,201 ----
      }

!     private boolean isValid(String part) {
!         return part != null && (0 < part.length ());
!     }

!     private void process(String part) {
!         if (name == null) {
!             if (!part.substring(0,1).equals(" ")) {
!                 name = part;
!                 equal=true;
!             }
!         }
!         else {
!             if (equal){
!                 if (part.equals("=")) {
!                     equal=false;
!                     delim=DELIMETERS_WITHOUT_EQUALS;
                      value=Tag.NOTHING;
!                 }
!                 else {
!                      putDataIntoTable(attributeTable,name,Tag.NULLVALUE,false);
!                      name=part;
!                      value=null;
!                 }
!             }
!             if (!equal && !part.equals("=")) {
!                 value=part;
!                 putDataIntoTable(attributeTable,name,value,false);
!                 name=null;
!                 value=null;
!             }
!         }
!     }

      private String getNextPartUsing(String delimiter) {
!         tokenAccumulator = null;
!         doubleQuote = false;
!         singleQuote = false;
!         ready = false;
          while (ready == false && tokenizer.hasMoreTokens()) {
              currentToken = tokenizer.nextToken(delimiter);
***************
*** 210,214 ****
                  tokenAccumulator="";
              } else {
!             	tokenAccumulator = currentToken;
                  ready = isReadyWithNextPart(currentToken);
              }
--- 210,214 ----
                  tokenAccumulator="";
              } else {
!                 tokenAccumulator = currentToken;
                  ready = isReadyWithNextPart(currentToken);
              }
***************
*** 217,256 ****
      }

! 	private boolean isReadyWithNextPart(String currentToken) {
! 		boolean ready = false;
! 		if (isDelimeter(currentToken)) {
! 		    if (currentToken.equals("=")){
! 		        ready=true;
! 		    }
! 		}
! 		else {
! 		    ready=true;
! 		}
! 		return ready;
! 	}

! 	private boolean isDelimeter(String token) {
! 		return delim.indexOf(tokenAccumulator)>=0;
! 	}
! 	
! 	private boolean isCurrentTokenSingleQuote() {
! 		return currentToken.charAt(0)==SINGLE_QUOTE;
! 	}

! 	private boolean isCurrentTokenDoubleQuote() {
! 		return currentToken.charAt(0)==DOUBLE_QUOTE;
! 	}

! 	private void combineTokensInsideSingleOrDoubleQuotes() {
! 		if (doubleQuote && currentToken.charAt(0)==DOUBLE_QUOTE){
! 		    doubleQuote= false;
! 		    ready=true;
! 		} else if (singleQuote && currentToken.charAt(0)==SINGLE_QUOTE) {
! 		    singleQuote=false;
! 		    ready=true;
! 		}else {
! 		    tokenAccumulator += currentToken;
! 		}
! 	}

--- 217,256 ----
      }

!     private boolean isReadyWithNextPart(String currentToken) {
!         boolean ready = false;
!         if (isDelimeter(currentToken)) {
!             if (currentToken.equals("=")){
!                 ready=true;
!             }
!         }
!         else {
!             ready=true;
!         }
!         return ready;
!     }

!     private boolean isDelimeter(String token) {
!         return delim.indexOf(tokenAccumulator)>=0;
!     }
!     
!     private boolean isCurrentTokenSingleQuote() {
!         return currentToken.charAt(0)==SINGLE_QUOTE;
!     }

!     private boolean isCurrentTokenDoubleQuote() {
!         return currentToken.charAt(0)==DOUBLE_QUOTE;
!     }

!     private void combineTokensInsideSingleOrDoubleQuotes() {
!         if (doubleQuote && currentToken.charAt(0)==DOUBLE_QUOTE){
!             doubleQuote= false;
!             ready=true;
!         } else if (singleQuote && currentToken.charAt(0)==SINGLE_QUOTE) {
!             singleQuote=false;
!             ready=true;
!         }else {
!             tokenAccumulator += currentToken;
!         }
!     }

Index: CompositeTagScannerHelper.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/CompositeTagScannerHelper.java,v
retrieving revision 1.43
retrieving revision 1.44
diff -C2 -d -r1.43 -r1.44
*** CompositeTagScannerHelper.java	24 Aug 2003 21:59:42 -0000	1.43
--- CompositeTagScannerHelper.java	3 Sep 2003 23:36:19 -0000	1.44
***************
*** 41,225 ****

  public class CompositeTagScannerHelper {
! 	private CompositeTagScanner scanner;
! 	private Tag tag;
! 	private String url;
! 	private NodeReader reader;
! 	private String currLine;
! 	private Tag endTag;
! 	private NodeList nodeList;
! 	private boolean endTagFound;
! 	private int startingLineNumber;
! 	private int endingLineNumber;
! 	private boolean balance_quotes;
! 	
! 	public CompositeTagScannerHelper(
! 		CompositeTagScanner scanner,
! 		Tag tag, 
! 		String url, 
! 		NodeReader reader,
! 		String currLine,
          boolean balance_quotes) {
! 		
! 		this.scanner = scanner;
! 		this.tag = tag;
! 		this.url = url;
! 		this.reader = reader;
! 		this.currLine = currLine;	
! 		this.endTag = null;
! 		this.nodeList = new NodeList();
! 		this.endTagFound = false;
          this.balance_quotes = balance_quotes;
! 	}

! 	public Tag scan() throws ParserException {
! 		this.startingLineNumber = reader.getLastLineNumber();
! 		if (shouldCreateEndTagAndExit()) {
! 			return createEndTagAndRepositionReader();
! 		}
! 		scanner.beforeScanningStarts();
! 		Node currentNode = tag;
! 	
! 		doEmptyXmlTagCheckOn(currentNode);
! 		if (!endTagFound) { 
! 			do {
! 				currentNode = reader.readElement(balance_quotes);
! 				if (currentNode==null) continue; 
! 				currLine = reader.getCurrentLine();
! 				if (currentNode instanceof Tag) 
! 					doForceCorrectionCheckOn((Tag)currentNode);
! 					
! 				doEmptyXmlTagCheckOn(currentNode);
! 				if (!endTagFound)
! 					doChildAndEndTagCheckOn(currentNode);					
! 			}
! 			while (currentNode!=null && !endTagFound);
! 		}
! 		if (endTag==null) {
! 			createCorrectionEndTagBefore(reader.getLastReadPosition()+1);
! 		}
! 		
! 		this.endingLineNumber = reader.getLastLineNumber();
! 		return createTag();
! 	}

! 	private boolean shouldCreateEndTagAndExit() {
! 		return scanner.shouldCreateEndTagAndExit();
! 	}

! 	private Tag createEndTagAndRepositionReader() {
! 		createCorrectionEndTagBefore(tag.elementBegin());
! 		reader.setPosInLine(tag.elementBegin());
! 		reader.setDontReadNextLine(true);
! 		return endTag;
! 	}

! 	private void createCorrectionEndTagBefore(int pos) {
! 		String endTagName = tag.getTagName();
! 		int endTagBegin = pos ;
! 		int endTagEnd = endTagBegin + endTagName.length() + 2; 
! 		endTag = new EndTag(
! 			new TagData(
! 				endTagBegin,
! 				endTagEnd,
! 				endTagName,
! 				currLine
! 			)
! 		);
! 	}
! 	
! 	private void createCorrectionEndTagBefore(Tag possibleEndTagCauser) {
! 		String endTagName = tag.getTagName();
! 		int endTagBegin = possibleEndTagCauser.elementBegin();
! 		int endTagEnd = endTagBegin + endTagName.length() + 2; 
! 		possibleEndTagCauser.setTagBegin(endTagEnd+1);
! 		reader.addNextParsedNode(possibleEndTagCauser);
! 		endTag = new EndTag(
! 			new TagData(
! 				endTagBegin,
! 				endTagEnd,
! 				endTagName,
! 				currLine
! 			)
! 		);
! 	}

! 	private Tag createTag() throws ParserException {
! 		CompositeTag newTag = 
! 		 	(CompositeTag)
! 		 	scanner.createTag(
! 			new TagData(
! 				tag.elementBegin(),
! 				endTag.elementEnd(),
! 				startingLineNumber,
! 				endingLineNumber,
! 				tag.getText(),
! 				currLine,
! 				url,
! 				tag.isEmptyXmlTag()
! 			),
! 			new CompositeTagData(
! 				tag,endTag,nodeList
! 			)
! 		);
! 		for (int i=0;i<newTag.getChildCount();i++) {
! 			Node child = newTag.childAt(i);
! 			child.setParent(newTag);
! 		}
! 		return newTag;
! 	}

! 	private void doChildAndEndTagCheckOn(Node currentNode) {
! 		if (currentNode instanceof EndTag) {
! 			EndTag possibleEndTag = (EndTag)currentNode;
! 			if (isExpectedEndTag(possibleEndTag)) {
! 				endTagFound = true;
! 				endTag = possibleEndTag;
! 				return;
! 			}
! 		}
! 		nodeList.add(currentNode);
! 		scanner.childNodeEncountered(currentNode);
! 	}

! 	private boolean isExpectedEndTag(EndTag possibleEndTag) {
! 		return possibleEndTag.getTagName().equals(tag.getTagName());
! 	}

! 	private void doEmptyXmlTagCheckOn(Node currentNode) {
! 		if (currentNode instanceof Tag) {
! 			Tag possibleEndTag = (Tag)currentNode;
! 			if (isXmlEndTag(tag)) {
! 				endTag = possibleEndTag;
! 				endTagFound = true;			
! 			} 
! 		}
! 	}

! 	private void doForceCorrectionCheckOn(Tag possibleEndTagCauser) {
! 		if (isEndTagMissing(possibleEndTagCauser)) {
! 			createCorrectionEndTagBefore(possibleEndTagCauser);

! 			endTagFound = true;			
! 		}
! 	}

! 	private boolean isEndTagMissing(Tag possibleEndTag) {
! 		return 
! 			scanner.isTagToBeEndedFor(possibleEndTag) || 
! 			isSelfChildTagRecievedIncorrectly(possibleEndTag);
! 	}

! 	private boolean isSelfChildTagRecievedIncorrectly(Tag possibleEndTag) {
! 		return (
! 			!(possibleEndTag instanceof EndTag) &&
! 			!scanner.isAllowSelfChildren() && 
! 			possibleEndTag.getTagName().equals(tag.getTagName())
! 		);
! 	}
! 	
! 	public boolean isXmlEndTag(Tag tag) {
! 		String tagText = tag.getText();
! 		int lastSlash = tagText.lastIndexOf("/");
! 		return (lastSlash == tagText.length()-1 || tag.isEmptyXmlTag()) && tag.getText().indexOf("://")==-1;
! 	}
  }
--- 41,225 ----

  public class CompositeTagScannerHelper {
!     private CompositeTagScanner scanner;
!     private Tag tag;
!     private String url;
!     private NodeReader reader;
!     private String currLine;
!     private Tag endTag;
!     private NodeList nodeList;
!     private boolean endTagFound;
!     private int startingLineNumber;
!     private int endingLineNumber;
!     private boolean balance_quotes;
!     
!     public CompositeTagScannerHelper(
!         CompositeTagScanner scanner,
!         Tag tag, 
!         String url, 
!         NodeReader reader,
!         String currLine,
          boolean balance_quotes) {
!         
!         this.scanner = scanner;
!         this.tag = tag;
!         this.url = url;
!         this.reader = reader;
!         this.currLine = currLine;   
!         this.endTag = null;
!         this.nodeList = new NodeList();
!         this.endTagFound = false;
          this.balance_quotes = balance_quotes;
!     }

!     public Tag scan() throws ParserException {
!         this.startingLineNumber = reader.getLastLineNumber();
!         if (shouldCreateEndTagAndExit()) {
!             return createEndTagAndRepositionReader();
!         }
!         scanner.beforeScanningStarts();
!         Node currentNode = tag;
!     
!         doEmptyXmlTagCheckOn(currentNode);
!         if (!endTagFound) { 
!             do {
!                 currentNode = reader.readElement(balance_quotes);
!                 if (currentNode==null) continue; 
!                 currLine = reader.getCurrentLine();
!                 if (currentNode instanceof Tag) 
!                     doForceCorrectionCheckOn((Tag)currentNode);
!                     
!                 doEmptyXmlTagCheckOn(currentNode);
!                 if (!endTagFound)
!                     doChildAndEndTagCheckOn(currentNode);                   
!             }
!             while (currentNode!=null && !endTagFound);
!         }
!         if (endTag==null) {
!             createCorrectionEndTagBefore(reader.getLastReadPosition()+1);
!         }
!         
!         this.endingLineNumber = reader.getLastLineNumber();
!         return createTag();
!     }

!     private boolean shouldCreateEndTagAndExit() {
!         return scanner.shouldCreateEndTagAndExit();
!     }

!     private Tag createEndTagAndRepositionReader() {
!         createCorrectionEndTagBefore(tag.elementBegin());
!         reader.setPosInLine(tag.elementBegin());
!         reader.setDontReadNextLine(true);
!         return endTag;
!     }

!     private void createCorrectionEndTagBefore(int pos) {
!         String endTagName = tag.getTagName();
!         int endTagBegin = pos ;
!         int endTagEnd = endTagBegin + endTagName.length() + 2; 
!         endTag = new EndTag(
!             new TagData(
!                 endTagBegin,
!                 endTagEnd,
!                 endTagName,
!                 currLine
!             )
!         );
!     }
!     
!     private void createCorrectionEndTagBefore(Tag possibleEndTagCauser) {
!         String endTagName = tag.getTagName();
!         int endTagBegin = possibleEndTagCauser.elementBegin();
!         int endTagEnd = endTagBegin + endTagName.length() + 2; 
!         possibleEndTagCauser.setTagBegin(endTagEnd+1);
!         reader.addNextParsedNode(possibleEndTagCauser);
!         endTag = new EndTag(
!             new TagData(
!                 endTagBegin,
!                 endTagEnd,
!                 endTagName,
!                 currLine
!             )
!         );
!     }

!     private Tag createTag() throws ParserException {
!         CompositeTag newTag = 
!             (CompositeTag)
!             scanner.createTag(
!             new TagData(
!                 tag.elementBegin(),
!                 endTag.elementEnd(),
!                 startingLineNumber,
!                 endingLineNumber,
!                 tag.getText(),
!                 currLine,
!                 url,
!                 tag.isEmptyXmlTag()
!             ),
!             new CompositeTagData(
!                 tag,endTag,nodeList
!             )
!         );
!         for (int i=0;i<newTag.getChildCount();i++) {
!             Node child = newTag.childAt(i);
!             child.setParent(newTag);
!         }
!         return newTag;
!     }

!     private void doChildAndEndTagCheckOn(Node currentNode) {
!         if (currentNode instanceof EndTag) {
!             EndTag possibleEndTag = (EndTag)currentNode;
!             if (isExpectedEndTag(possibleEndTag)) {
!                 endTagFound = true;
!                 endTag = possibleEndTag;
!                 return;
!             }
!         }
!         nodeList.add(currentNode);
!         scanner.childNodeEncountered(currentNode);
!     }

!     private boolean isExpectedEndTag(EndTag possibleEndTag) {
!         return possibleEndTag.getTagName().equals(tag.getTagName());
!     }

!     private void doEmptyXmlTagCheckOn(Node currentNode) {
!         if (currentNode instanceof Tag) {
!             Tag possibleEndTag = (Tag)currentNode;
!             if (isXmlEndTag(tag)) {
!                 endTag = possibleEndTag;
!                 endTagFound = true;         
!             } 
!         }
!     }

!     private void doForceCorrectionCheckOn(Tag possibleEndTagCauser) {
!         if (isEndTagMissing(possibleEndTagCauser)) {
!             createCorrectionEndTagBefore(possibleEndTagCauser);

!             endTagFound = true;         
!         }
!     }

!     private boolean isEndTagMissing(Tag possibleEndTag) {
!         return 
!             scanner.isTagToBeEndedFor(possibleEndTag) || 
!             isSelfChildTagRecievedIncorrectly(possibleEndTag);
!     }

!     private boolean isSelfChildTagRecievedIncorrectly(Tag possibleEndTag) {
!         return (
!             !(possibleEndTag instanceof EndTag) &&
!             !scanner.isAllowSelfChildren() && 
!             possibleEndTag.getTagName().equals(tag.getTagName())
!         );
!     }
!     
!     public boolean isXmlEndTag(Tag tag) {
!         String tagText = tag.getText();
!         int lastSlash = tagText.lastIndexOf("/");
!         return (lastSlash == tagText.length()-1 || tag.isEmptyXmlTag()) && tag.getText().indexOf("://")==-1;
!     }
  }

Index: ParserHelper.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/ParserHelper.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** ParserHelper.java	24 Aug 2003 21:59:42 -0000	1.13
--- ParserHelper.java	3 Sep 2003 23:36:19 -0000	1.14
***************
*** 42,106 ****
  public class ParserHelper implements Serializable {

! 	public ParserHelper() {
! 		super();
! 	}

! 	/**
! 	 * Opens a connection using the given url.
! 	 * @param url The url to open.
! 	 * @param feedback The ibject to use for messages or <code>null</code>.
! 	 * @exception ParserException if an i/o exception occurs accessing the url.
! 	 */
! 	public static URLConnection openConnection (URL url, ParserFeedback feedback)
! 	    throws
! 	        ParserException
! 	{
! 	    URLConnection ret;
! 	    
! 	    try
! 	    {
! 	        ret = url.openConnection ();
! 	    }
! 	    catch (IOException ioe)
! 	    {
! 	        String msg = "HTMLParser.openConnection() : Error in opening a connection to " + url.toExternalForm ();
! 	        ParserException ex = new ParserException (msg, ioe);
! 	        if (null != feedback)
! 	            feedback.error (msg, ex);
! 	        throw ex;
! 	    }
! 	    
! 	    return (ret);
! 	}

! 	/**
! 	 * Opens a connection based on a given string.
! 	 * The string is either a file, in which case <code>file://localhost</code>
! 	 * is prepended to a canonical path derived from the string, or a url that
! 	 * begins with one of the known protocol strings, i.e. <code>http://</code>.
       * Embedded spaces are silently converted to %20 sequences.
! 	 * @param string The name of a file or a url.
! 	 * @param feedback The object to use for messages or <code>null</code> for no feedback.
! 	 * @exception ParserException if the string is not a valid url or file.
! 	 */
! 	public static URLConnection openConnection (String string, ParserFeedback feedback)
! 	    throws
! 	        ParserException
! 	{
! 	    final String prefix = "file://localhost";
! 	    String resource;
! 	    URL url;
! 	    StringBuffer buffer;
! 	    URLConnection ret;
! 	
! 	    try
! 	    {
! 	        url = new URL (LinkProcessor.fixSpaces (string));
! 	        ret =  ParserHelper.openConnection (url, feedback);
! 	    }
! 	    catch (MalformedURLException murle)
! 	    {   // try it as a file
! 	        try
! 	        {
                  File file = new File (string);
                  resource = file.getCanonicalPath ();
--- 42,106 ----
  public class ParserHelper implements Serializable {

!     public ParserHelper() {
!         super();
!     }

!     /**
!      * Opens a connection using the given url.
!      * @param url The url to open.
!      * @param feedback The ibject to use for messages or <code>null</code>.
!      * @exception ParserException if an i/o exception occurs accessing the url.
!      */
!     public static URLConnection openConnection (URL url, ParserFeedback feedback)
!         throws
!             ParserException
!     {
!         URLConnection ret;
!         
!         try
!         {
!             ret = url.openConnection ();
!         }
!         catch (IOException ioe)
!         {
!             String msg = "HTMLParser.openConnection() : Error in opening a connection to " + url.toExternalForm ();
!             ParserException ex = new ParserException (msg, ioe);
!             if (null != feedback)
!                 feedback.error (msg, ex);
!             throw ex;
!         }
!         
!         return (ret);
!     }

!     /**
!      * Opens a connection based on a given string.
!      * The string is either a file, in which case <code>file://localhost</code>
!      * is prepended to a canonical path derived from the string, or a url that
!      * begins with one of the known protocol strings, i.e. <code>http://</code>.
       * Embedded spaces are silently converted to %20 sequences.
!      * @param string The name of a file or a url.
!      * @param feedback The object to use for messages or <code>null</code> for no feedback.
!      * @exception ParserException if the string is not a valid url or file.
!      */
!     public static URLConnection openConnection (String string, ParserFeedback feedback)
!         throws
!             ParserException
!     {
!         final String prefix = "file://localhost";
!         String resource;
!         URL url;
!         StringBuffer buffer;
!         URLConnection ret;
!     
!         try
!         {
!             url = new URL (LinkProcessor.fixSpaces (string));
!             ret =  ParserHelper.openConnection (url, feedback);
!         }
!         catch (MalformedURLException murle)
!         {   // try it as a file
!             try
!             {
                  File file = new File (string);
                  resource = file.getCanonicalPath ();
***************
*** 110,189 ****
                      buffer.append ("/");
                  buffer.append (resource);
! 	            url = new URL (LinkProcessor.fixSpaces (buffer.toString ()));
! 	            ret = ParserHelper.openConnection (url, feedback);
! 	            if (null != feedback)
! 	                feedback.info (url.toExternalForm ());
! 	        }
! 	        catch (MalformedURLException murle2)
! 	        {
! 	            String msg = "HTMLParser.openConnection() : Error in opening a connection to " + string;
! 	            ParserException ex = new ParserException (msg, murle2);
! 	            if (null != feedback)
! 	                feedback.error (msg, ex);
! 	            throw ex;
! 	        }
! 	        catch (IOException ioe)
              {
! 	            String msg = "HTMLParser.openConnection() : Error in opening a connection to " + string;
! 	            ParserException ex = new ParserException (msg, ioe);
! 	            if (null != feedback)
! 	                feedback.error (msg, ex);
! 	            throw ex;
              }
! 	    }
! 	    
! 	    return (ret);
! 	}

! 	/**
! 	 * Lookup a character set name.
! 	 * <em>Vacuous for JVM's without <code>java.nio.charset</code>.</em>
! 	 * This uses reflection so the code will still run under prior JDK's but
! 	 * in that case the default is always returned.
! 	 * @param name The name to look up. One of the aliases for a character set.
! 	 * @param _default The name to return if the lookup fails.
! 	 */
! 	public static String findCharset (String name, String _default)
! 	{
! 		String ret;

! 		try
! 		{
! 			Class cls;
! 			java.lang.reflect.Method method;
! 			Object object;

! 			cls = Class.forName ("java.nio.charset.Charset");
! 			method = cls.getMethod ("forName", new Class[] { String.class });
! 			object = method.invoke (null, new Object[] { name });
! 			method = cls.getMethod ("name", new Class[] { });
! 			object = method.invoke (object, new Object[] { });
! 			ret = (String)object;
! 		}
! 		catch (ClassNotFoundException cnfe)
! 		{
! 			// for reflection exceptions, assume the name is correct
! 			ret = name;
! 		}
! 		catch (NoSuchMethodException nsme)
! 		{
! 			// for reflection exceptions, assume the name is correct
! 			ret = name;
! 		}
! 		catch (IllegalAccessException ia)
! 		{
! 			// for reflection exceptions, assume the name is correct
! 			ret = name;
! 		}
! 		catch (java.lang.reflect.InvocationTargetException ita)
! 		{
! 			// java.nio.charset.IllegalCharsetNameException
! 			// and java.nio.charset.UnsupportedCharsetException
! 			// return the default
! 			ret = _default;
! 		}

! 		return (ret);
! 	}

  }
--- 110,189 ----
                      buffer.append ("/");
                  buffer.append (resource);
!                 url = new URL (LinkProcessor.fixSpaces (buffer.toString ()));
!                 ret = ParserHelper.openConnection (url, feedback);
!                 if (null != feedback)
!                     feedback.info (url.toExternalForm ());
!             }
!             catch (MalformedURLException murle2)
              {
!                 String msg = "HTMLParser.openConnection() : Error in opening a connection to " + string;
!                 ParserException ex = new ParserException (msg, murle2);
!                 if (null != feedback)
!                     feedback.error (msg, ex);
!                 throw ex;
              }
!             catch (IOException ioe)
!             {
!                 String msg = "HTMLParser.openConnection() : Error in opening a connection to " + string;
!                 ParserException ex = new ParserException (msg, ioe);
!                 if (null != feedback)
!                     feedback.error (msg, ex);
!                 throw ex;
!             }
!         }
!         
!         return (ret);
!     }

!     /**
!      * Lookup a character set name.
!      * <em>Vacuous for JVM's without <code>java.nio.charset</code>.</em>
!      * This uses reflection so the code will still run under prior JDK's but
!      * in that case the default is always returned.
!      * @param name The name to look up. One of the aliases for a character set.
!      * @param _default The name to return if the lookup fails.
!      */
!     public static String findCharset (String name, String _default)
!     {
!         String ret;

!         try
!         {
!             Class cls;
!             java.lang.reflect.Method method;
!             Object object;

!             cls = Class.forName ("java.nio.charset.Charset");
!             method = cls.getMethod ("forName", new Class[] { String.class });
!             object = method.invoke (null, new Object[] { name });
!             method = cls.getMethod ("name", new Class[] { });
!             object = method.invoke (object, new Object[] { });
!             ret = (String)object;
!         }
!         catch (ClassNotFoundException cnfe)
!         {
!             // for reflection exceptions, assume the name is correct
!             ret = name;
!         }
!         catch (NoSuchMethodException nsme)
!         {
!             // for reflection exceptions, assume the name is correct
!             ret = name;
!         }
!         catch (IllegalAccessException ia)
!         {
!             // for reflection exceptions, assume the name is correct
!             ret = name;
!         }
!         catch (java.lang.reflect.InvocationTargetException ita)
!         {
!             // java.nio.charset.IllegalCharsetNameException
!             // and java.nio.charset.UnsupportedCharsetException
!             // return the default
!             ret = _default;
!         }

!         return (ret);
!     }

  }

Index: ScriptScannerHelper.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/ScriptScannerHelper.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** ScriptScannerHelper.java	24 Aug 2003 21:59:42 -0000	1.8
--- ScriptScannerHelper.java	3 Sep 2003 23:36:19 -0000	1.9
***************
*** 38,214 ****
  public class ScriptScannerHelper {

! 	private int endTagLoc;
! 	private Tag endTag;
! 	private Tag startTag;
! 	private int startingPos;
! 	private boolean sameLine;
! 	private boolean endTagFound;
! 	private NodeReader reader;
! 	private StringBuffer scriptContents;
! 	private ScriptScanner scriptScanner;
! 	private Tag tag;
! 	private String url;
! 	private String currLine;
! 	
! 	public ScriptScannerHelper(Tag tag, String url, NodeReader nodeReader, String currLine, ScriptScanner scriptScanner) {
! 		this.reader = nodeReader;
! 		this.scriptScanner = scriptScanner;
! 		this.tag = tag;
! 		this.url = url;
! 		this.currLine = currLine;
! 	}

! 	public Tag scan() throws ParserException {
! 		int startLine = reader.getLastLineNumber();
! 		startTag = tag;
! 		extractScriptTagFrom(currLine);
! 		if (isScriptEndTagNotFound()) {
! 			createScriptEndTag(tag, currLine);
! 		}
! 		return createScriptTagUsing(url, currLine, startLine);
! 	}
! 	
! 	private Tag createScriptTagUsing(String url, String currLine, int startLine) {
! 		return scriptScanner.createTag(
! 			new TagData(
! 				startTag.elementBegin(),
! 				endTag.elementEnd(),
! 				startLine,
! 				reader.getLastLineNumber(),
! 				startTag.getText(),
! 				currLine,
! 				url,
! 				false
! 			), new CompositeTagData(
! 				startTag,endTag,createChildrenNodeList()
! 			)
! 		);
! 	}

! 	private NodeList createChildrenNodeList() {
! 		NodeList childrenNodeList = new NodeList();
! 		childrenNodeList.add(
! 			new StringNode(
! 				scriptContents,
! 				startTag.elementEnd(),
! 				endTag.elementBegin()-1
! 			)
! 		);
! 		return childrenNodeList;
! 	}

! 	private void createScriptEndTag(Tag tag, String currLine) {
! 		// If end tag doesn't exist, create one
! 		String endTagName = tag.getTagName();
! 		int endTagBegin = reader.getLastReadPosition()+1 ;
! 		int endTagEnd = endTagBegin + endTagName.length() + 2; 
! 		endTag = new EndTag(
! 			new TagData(
! 				endTagBegin,
! 				endTagEnd,
! 				endTagName,
! 				currLine
! 			)
! 		);
! 	}

! 	private boolean isScriptEndTagNotFound() {
! 		return endTag == null;
! 	}

! 	private void extractScriptTagFrom(String currLine) throws ParserException {
! 		String line = null;
! 		scriptContents = new StringBuffer();
! 		endTagFound = false;
! 		
! 		endTag = null;
! 		line = currLine;
! 		sameLine = true;
! 		startingPos = startTag.elementEnd();
! 		do {
! 			doExtractionOfScriptContentsFrom(line);
! 			if (!endTagFound) {
! 				line = reader.getNextLine();
! 				startingPos = 0;
! 			}
! 			if (sameLine) 
! 				sameLine = false;
! 		}
! 		while (line!=null && !endTagFound);
! 	}

! 	private void doExtractionOfScriptContentsFrom(String line) throws ParserException {
! 		endTagLoc = line.toUpperCase().indexOf(scriptScanner.getEndTag(),startingPos);
! 		findStartingAndEndingLocations(line);
! 		
! 		if (endTagLoc!=-1) {
! 			extractEndTagFrom(line);
! 		} else {
! 			continueParsing(line);
! 		}
! 	}

! 	private void continueParsing(String line) {
! 		if (sameLine) 
! 			scriptContents.append(
! 				line.substring(
! 					startTag.elementEnd()+1
! 				)
! 			);
! 		else {
! 			scriptContents.append(Parser.getLineSeparator());
! 			scriptContents.append(line);
! 		}
! 	}

! 	private void extractEndTagFrom(String line) throws ParserException {
! 		endTagFound = true;
! 		endTag = (EndTag)EndTag.find(line,endTagLoc);
! 		if (sameLine) 
! 			scriptContents.append(
! 				getCodeBetweenStartAndEndTags(
! 					line,
! 					startTag,
! 					endTagLoc)
! 			);
! 		else {
! 			scriptContents.append(Parser.getLineSeparator());
! 			scriptContents.append(line.substring(0,endTagLoc));
! 		}
! 		
! 		reader.setPosInLine(endTag.elementEnd());
! 	}

! 	private void findStartingAndEndingLocations(String line) {
! 		while (endTagLoc>0 && isThisEndTagLocationFalseMatch(line, endTagLoc)) {
! 			startingPos = endTagLoc+scriptScanner.getEndTag().length();
! 			endTagLoc = line.toUpperCase().indexOf(scriptScanner.getEndTag(), startingPos); 	
! 		}
! 	}

! 	public String getCodeBetweenStartAndEndTags(
! 		String line,
! 		Tag startTag,
! 		int endTagLoc) throws ParserException {
! 		try {
! 			
! 			return line.substring(
! 				startTag.elementEnd()+1,
! 				endTagLoc
! 			);
! 		}
! 		catch (Exception e) {
! 			StringBuffer msg = new StringBuffer("Error in getCodeBetweenStartAndEndTags():\n");
! 			msg.append("substring starts at: "+(startTag.elementEnd()+1)).append("\n");
! 			msg.append("substring ends at: "+(endTagLoc));
! 			throw new ParserException(msg.toString(),e);
! 		}
! 	}

! 	private boolean isThisEndTagLocationFalseMatch(String line, int endTagLoc) {
! 		if (endTagLoc+scriptScanner.getEndTag().length() > line.length()-1) return false;
! 		char charAfterSuspectedEndTag = 
! 			line.charAt(endTagLoc+scriptScanner.getEndTag().length()); 
! 		return charAfterSuspectedEndTag=='"' || charAfterSuspectedEndTag=='\'';
! 	}
  }
--- 38,214 ----
  public class ScriptScannerHelper {

!     private int endTagLoc;
!     private Tag endTag;
!     private Tag startTag;
!     private int startingPos;
!     private boolean sameLine;
!     private boolean endTagFound;
!     private NodeReader reader;
!     private StringBuffer scriptContents;
!     private ScriptScanner scriptScanner;
!     private Tag tag;
!     private String url;
!     private String currLine;
!     
!     public ScriptScannerHelper(Tag tag, String url, NodeReader nodeReader, String currLine, ScriptScanner scriptScanner) {
!         this.reader = nodeReader;
!         this.scriptScanner = scriptScanner;
!         this.tag = tag;
!         this.url = url;
!         this.currLine = currLine;
!     }

!     public Tag scan() throws ParserException {
!         int startLine = reader.getLastLineNumber();
!         startTag = tag;
!         extractScriptTagFrom(currLine);
!         if (isScriptEndTagNotFound()) {
!             createScriptEndTag(tag, currLine);
!         }
!         return createScriptTagUsing(url, currLine, startLine);
!     }
!     
!     private Tag createScriptTagUsing(String url, String currLine, int startLine) {
!         return scriptScanner.createTag(
!             new TagData(
!                 startTag.elementBegin(),
!                 endTag.elementEnd(),
!                 startLine,
!                 reader.getLastLineNumber(),
!                 startTag.getText(),
!                 currLine,
!                 url,
!                 false
!             ), new CompositeTagData(
!                 startTag,endTag,createChildrenNodeList()
!             )
!         );
!     }

!     private NodeList createChildrenNodeList() {
!         NodeList childrenNodeList = new NodeList();
!         childrenNodeList.add(
!             new StringNode(
!                 scriptContents,
!                 startTag.elementEnd(),
!                 endTag.elementBegin()-1
!             )
!         );
!         return childrenNodeList;
!     }

!     private void createScriptEndTag(Tag tag, String currLine) {
!         // If end tag doesn't exist, create one
!         String endTagName = tag.getTagName();
!         int endTagBegin = reader.getLastReadPosition()+1 ;
!         int endTagEnd = endTagBegin + endTagName.length() + 2; 
!         endTag = new EndTag(
!             new TagData(
!                 endTagBegin,
!                 endTagEnd,
!                 endTagName,
!                 currLine
!             )
!         );
!     }

!     private boolean isScriptEndTagNotFound() {
!         return endTag == null;
!     }

!     private void extractScriptTagFrom(String currLine) throws ParserException {
!         String line = null;
!         scriptContents = new StringBuffer();
!         endTagFound = false;
!         
!         endTag = null;
!         line = currLine;
!         sameLine = true;
!         startingPos = startTag.elementEnd();
!         do {
!             doExtractionOfScriptContentsFrom(line);
!             if (!endTagFound) {
!                 line = reader.getNextLine();
!                 startingPos = 0;
!             }
!             if (sameLine) 
!                 sameLine = false;
!         }
!         while (line!=null && !endTagFound);
!     }

!     private void doExtractionOfScriptContentsFrom(String line) throws ParserException {
!         endTagLoc = line.toUpperCase().indexOf(scriptScanner.getEndTag(),startingPos);
!         findStartingAndEndingLocations(line);
!         
!         if (endTagLoc!=-1) {
!             extractEndTagFrom(line);
!         } else {
!             continueParsing(line);
!         }
!     }

!     private void continueParsing(String line) {
!         if (sameLine) 
!             scriptContents.append(
!                 line.substring(
!                     startTag.elementEnd()+1
!                 )
!             );
!         else {
!             scriptContents.append(Parser.getLineSeparator());
!             scriptContents.append(line);
!         }
!     }

!     private void extractEndTagFrom(String line) throws ParserException {
!         endTagFound = true;
!         endTag = (EndTag)EndTag.find(line,endTagLoc);
!         if (sameLine) 
!             scriptContents.append(
!                 getCodeBetweenStartAndEndTags(
!                     line,
!                     startTag,
!                     endTagLoc)
!             );
!         else {
!             scriptContents.append(Parser.getLineSeparator());
!             scriptContents.append(line.substring(0,endTagLoc));
!         }
!         
!         reader.setPosInLine(endTag.elementEnd());
!     }

!     private void findStartingAndEndingLocations(String line) {
!         while (endTagLoc>0 && isThisEndTagLocationFalseMatch(line, endTagLoc)) {
!             startingPos = endTagLoc+scriptScanner.getEndTag().length();
!             endTagLoc = line.toUpperCase().indexOf(scriptScanner.getEndTag(), startingPos);     
!         }
!     }

!     public String getCodeBetweenStartAndEndTags(
!         String line,
!         Tag startTag,
!         int endTagLoc) throws ParserException {
!         try {
!             
!             return line.substring(
!                 startTag.elementEnd()+1,
!                 endTagLoc
!             );
!         }
!         catch (Exception e) {
!             StringBuffer msg = new StringBuffer("Error in getCodeBetweenStartAndEndTags():\n");
!             msg.append("substring starts at: "+(startTag.elementEnd()+1)).append("\n");
!             msg.append("substring ends at: "+(endTagLoc));
!             throw new ParserException(msg.toString(),e);
!         }
!     }

!     private boolean isThisEndTagLocationFalseMatch(String line, int endTagLoc) {
!         if (endTagLoc+scriptScanner.getEndTag().length() > line.length()-1) return false;
!         char charAfterSuspectedEndTag = 
!             line.charAt(endTagLoc+scriptScanner.getEndTag().length()); 
!         return charAfterSuspectedEndTag=='"' || charAfterSuspectedEndTag=='\'';
!     }
  }

Index: StringParser.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/StringParser.java,v
retrieving revision 1.36
retrieving revision 1.37
diff -C2 -d -r1.36 -r1.37
*** StringParser.java	24 Aug 2003 21:59:42 -0000	1.36
--- StringParser.java	3 Sep 2003 23:36:19 -0000	1.37
***************
*** 34,41 ****

  public class StringParser {
! 	private final static int BEFORE_PARSE_BEGINS_STATE=0;	
! 	private final static int PARSE_HAS_BEGUN_STATE=1;
! 	private final static int PARSE_COMPLETED_STATE=2;	
! 	private final static int PARSE_IGNORE_STATE=3;

      /**
--- 34,41 ----

  public class StringParser {
!     private final static int BEFORE_PARSE_BEGINS_STATE=0;   
!     private final static int PARSE_HAS_BEGUN_STATE=1;
!     private final static int PARSE_COMPLETED_STATE=2;   
!     private final static int PARSE_IGNORE_STATE=3;

      /**
***************
*** 65,89 ****
      }

! 	/**
! 	 * Locate the StringNode within the input string, by parsing from the given position
! 	 * @param reader HTML reader to be provided so as to allow reading of next line
! 	 * @param input Input String
! 	 * @param position Position to start parsing from
! 	 * @param balance_quotes If <code>true</code> enter ignoring state on
       * encountering quotes.
! 	 */		
! 	public Node find(NodeReader reader,String input,int position, boolean balance_quotes)
! 	{
! 		StringBuffer textBuffer = new StringBuffer();
! 		int state = BEFORE_PARSE_BEGINS_STATE;
! 		int textBegin=position;
! 		int textEnd=position;
! 		int inputLen = input.length();
! 		char ch;
          char ignore_ender = '\"';
! 		for (int i=position;(i<inputLen && state!=PARSE_COMPLETED_STATE);i++)
! 		{
! 			ch  = input.charAt(i);
! 			if (ch=='<' && state!=PARSE_IGNORE_STATE)
              {
                  if (beginTag (input, i))
--- 65,89 ----
      }

!     /**
!      * Locate the StringNode within the input string, by parsing from the given position
!      * @param reader HTML reader to be provided so as to allow reading of next line
!      * @param input Input String
!      * @param position Position to start parsing from
!      * @param balance_quotes If <code>true</code> enter ignoring state on
       * encountering quotes.
!      */     
!     public Node find(NodeReader reader,String input,int position, boolean balance_quotes)
!     {
!         StringBuffer textBuffer = new StringBuffer();
!         int state = BEFORE_PARSE_BEGINS_STATE;
!         int textBegin=position;
!         int textEnd=position;
!         int inputLen = input.length();
!         char ch;
          char ignore_ender = '\"';
!         for (int i=position;(i<inputLen && state!=PARSE_COMPLETED_STATE);i++)
!         {
!             ch  = input.charAt(i);
!             if (ch=='<' && state!=PARSE_IGNORE_STATE)
              {
                  if (beginTag (input, i))
***************
*** 92,142 ****
                      textEnd=i-1;
                  }
! 			}
! 			if (balance_quotes && (ch=='\'' || ch=='"'))
              {
! 				if (state==PARSE_IGNORE_STATE)
                  {
                      if (ch == ignore_ender)
                          state=PARSE_HAS_BEGUN_STATE;
                  }
! 				else
                  {
                      ignore_ender = ch;
                      state = PARSE_IGNORE_STATE;
! 				}
! 			}					
! 			if (state==BEFORE_PARSE_BEGINS_STATE)
! 			{
! 				state=PARSE_HAS_BEGUN_STATE;
! 			}
! 			if (state==PARSE_HAS_BEGUN_STATE || state==PARSE_IGNORE_STATE)
! 			{
! 				textBuffer.append(input.charAt(i));
! 			}				
! 			// Patch by Cedric Rosa
! 			if (state==BEFORE_PARSE_BEGINS_STATE && i==inputLen-1)
! 			   state=PARSE_HAS_BEGUN_STATE;
! 			if (state==PARSE_HAS_BEGUN_STATE && i==inputLen-1)
! 			{
! 				do {
! 					input = reader.getNextLine();
! 					if (input!=null && input.length()==0)
! 						textBuffer.append(Parser.getLineSeparator());
! 				}
! 				while (input!=null && input.length()==0);
! 				
! 				if (input==null) {
! 					textEnd=i;
! 					state =PARSE_COMPLETED_STATE;
! 					
! 				} else {
! 					textBuffer.append(Parser.getLineSeparator());
! 					inputLen = input.length();
! 					i=-1;
! 				}

! 			}
! 		}
! 		return reader.getParser().getStringNodeFactory().createStringNode(textBuffer, textBegin, textEnd);
! 	}
  }
--- 92,142 ----
                      textEnd=i-1;
                  }
!             }
!             if (balance_quotes && (ch=='\'' || ch=='"'))
              {
!                 if (state==PARSE_IGNORE_STATE)
                  {
                      if (ch == ignore_ender)
                          state=PARSE_HAS_BEGUN_STATE;
                  }
!                 else
                  {
                      ignore_ender = ch;
                      state = PARSE_IGNORE_STATE;
!                 }
!             }                   
!             if (state==BEFORE_PARSE_BEGINS_STATE)
!             {
!                 state=PARSE_HAS_BEGUN_STATE;
!             }
!             if (state==PARSE_HAS_BEGUN_STATE || state==PARSE_IGNORE_STATE)
!             {
!                 textBuffer.append(input.charAt(i));
!             }               
!             // Patch by Cedric Rosa
!             if (state==BEFORE_PARSE_BEGINS_STATE && i==inputLen-1)
!                state=PARSE_HAS_BEGUN_STATE;
!             if (state==PARSE_HAS_BEGUN_STATE && i==inputLen-1)
!             {
!                 do {
!                     input = reader.getNextLine();
!                     if (input!=null && input.length()==0)
!                         textBuffer.append(Parser.getLineSeparator());
!                 }
!                 while (input!=null && input.length()==0);
!                 
!                 if (input==null) {
!                     textEnd=i;
!                     state =PARSE_COMPLETED_STATE;
!                     
!                 } else {
!                     textBuffer.append(Parser.getLineSeparator());
!                     inputLen = input.length();
!                     i=-1;
!                 }

!             }
!         }
!         return reader.getParser().getStringNodeFactory().createStringNode(textBuffer, textBegin, textEnd);
!     }
  }

Index: TagParser.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/TagParser.java,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** TagParser.java	24 Aug 2003 21:59:42 -0000	1.39
--- TagParser.java	3 Sep 2003 23:36:19 -0000	1.40
***************
*** 38,244 ****

  public class TagParser {
! 	public final static int TAG_BEFORE_PARSING_STATE=1;
! 	public final static int TAG_BEGIN_PARSING_STATE=1<<2;
! 	public final static int TAG_FINISHED_PARSING_STATE=1<<3;
! 	public final static int TAG_ILLEGAL_STATE=1<<4;
! 	public final static int TAG_IGNORE_DATA_STATE=1<<5;	    
! 	public final static int TAG_IGNORE_BEGIN_TAG_STATE=1<<6;
! 	public final static int TAG_IGNORE_CHAR_SINGLE_QUOTE=1<<7;
! 	
! 	public final static String ENCOUNTERED_QUERY_MESSAGE = "TagParser : Encountered > after a query. Accepting without correction and continuing parsing";
! 	
! 	private ParserFeedback feedback;

! 	public TagParser(ParserFeedback feedback) {
! 		this.feedback = feedback;
! 	}

! 	public Tag find(NodeReader reader,String input,int position) {
! 		int state = TAG_BEFORE_PARSING_STATE;
! 		int i=position;
! 		char ch;
! 		char[] ignorechar = new char[1]; // holds the character we're looking for when in TAG_IGNORE_DATA_STATE
! 		Tag tag = new Tag(new TagData(position, 0, reader.getLastLineNumber(), 0, "", input, "", false));

! 		Bool encounteredQuery = new Bool(false);
! 		while (i<tag.getTagLine().length() && 
! 				state!=TAG_FINISHED_PARSING_STATE && 
! 				state!=TAG_ILLEGAL_STATE
! 			)
! 		{
! 			ch = tag.getTagLine().charAt(i);
! 			state = automataInput(encounteredQuery, i, state, ch, tag, i, ignorechar);
! 			i = incrementCounter(i, reader, state, tag);
! 		}
! 		if (state==TAG_FINISHED_PARSING_STATE) {
! 			String tagLine = tag.getTagLine();
! 			if (i>1 && tagLine.charAt(i-2)=='/') {
! 				tag.setEmptyXmlTag(true);
! 				String tagContents = tag.getText();
! 				tag.setText(tagContents.substring(0,tagContents.length()-1));
! 			}
! 			return tag;
! 		} else
! 			return null;
! 	}

! 	private int automataInput(Bool encounteredQuery, int i, int state,char ch, Tag tag, int pos, char[] ignorechar) {
! 		state = checkIllegalState(i, state, ch, tag);
! 		state = checkFinishedState(encounteredQuery, i, state, ch, tag, pos);
! 		state = toggleIgnoringState(state, ch, ignorechar);
! 		if (state==TAG_BEFORE_PARSING_STATE && ch!='<') {
! 			state= TAG_ILLEGAL_STATE;
! 		}
! 		if (state==TAG_IGNORE_DATA_STATE && ch=='<') {
! 			// If the next tag char is is close tag, then
! 			// this is legal, we should continue
! 			if (!isWellFormedTag(tag,pos))
! 				state = TAG_IGNORE_BEGIN_TAG_STATE;
! 		}
! 		if (state==TAG_IGNORE_BEGIN_TAG_STATE && ch=='>') {
! 			state = TAG_IGNORE_DATA_STATE;
! 		}
! 		checkIfAppendable(encounteredQuery, state, ch, tag);
! 		state = checkBeginParsingState(i, state, ch, tag);

! 		return state;
! 	}

! 	private int checkBeginParsingState(int i, int state, char ch, Tag tag) {
! 		if (ch=='<' && 
! 			(state==TAG_BEFORE_PARSING_STATE || 
! 			  state==TAG_ILLEGAL_STATE))
! 		{
! 			// Transition from State 0 to State 1 - Record data till > is encountered
! 			tag.setTagBegin(i);
! 			state = TAG_BEGIN_PARSING_STATE;
! 		}
! 		return state;
! 	}

! 	private boolean isWellFormedTag(Tag tag, int pos) {
! 		String inputLine = tag.getTagLine();
! 		int closeTagPos = inputLine.indexOf('>',pos+1);
! 		int openTagPos = inputLine.indexOf('<',pos+1);
! 		return openTagPos > closeTagPos || (openTagPos ==-1 && closeTagPos!=-1);
!  	}
!  	
! 	private int checkFinishedState(Bool encounteredQuery, int i, int state,  char ch, Tag tag, int pos) {
! 		if (ch=='>')
! 		{
! 			if (state==TAG_BEGIN_PARSING_STATE)
! 			{
! 				state = TAG_FINISHED_PARSING_STATE;
! 				tag.setTagEnd(i);
! 			}
! 			else
! 			if (state==TAG_IGNORE_DATA_STATE) {
! 				if (encounteredQuery.getBoolean()) {
! 					encounteredQuery.setBoolean(false);
! 					feedback.info(ENCOUNTERED_QUERY_MESSAGE);
! 					return state;
! 				}
! 				// Now, either this is a valid > input, and should be ignored,
! 				// or it is a mistake in the html, in which case we need to correct it *sigh*
! 				if (isWellFormedTag(tag,pos)) return state;
! 				
! 				state = TAG_FINISHED_PARSING_STATE;
! 				tag.setTagEnd(i);
! 				// Do Correction
! 				// Correct the tag - assuming its grouped into name value pairs
! 				// Remove all inverted commas.
! 				correctTag(tag);
! 			
! 				StringBuffer msg = new StringBuffer();
! 				msg.append("HTMLTagParser : Encountered > inside inverted commas in line \n");
! 				msg.append(tag.getTagLine());
! 				msg.append(", location ");
! 				msg.append(i);
! 				msg.append("\n");
! 				for (int j=0;j<i;j++) msg.append(' ');
! 				msg.append('^');
! 				msg.append("\nAutomatically corrected.");
! 				feedback.warning(msg.toString());
! 			}
! 		} else
! 		if (ch=='<' && 
! 			state==TAG_BEGIN_PARSING_STATE && 
! 			tag.getText().charAt(0)!='%'
! 			) {
! 			state = TAG_FINISHED_PARSING_STATE;
! 			tag.setTagEnd(i-1);i--;
! 		}
! 		return state;
! 	}

! 	private void checkIfAppendable(Bool encounteredQuery,int state, char ch, Tag tag) {
! 		if (state==TAG_IGNORE_DATA_STATE || 
! 			state==TAG_BEGIN_PARSING_STATE || 
! 			state==TAG_IGNORE_BEGIN_TAG_STATE) {
! 			if (ch=='?') 
! 				encounteredQuery.setBoolean(true);
! 			tag.append(ch);
! 		}
! 	}

! 	private int checkIllegalState(int i, int state, char ch, Tag tag) {
! 		if (ch=='/' && i>0 && tag.getTagLine().charAt(i-1)=='<' && 
! 			state!=TAG_IGNORE_DATA_STATE && 
! 			state!=TAG_IGNORE_BEGIN_TAG_STATE)
! 		{
! 			state = TAG_ILLEGAL_STATE;
! 		}

! 		return state;
! 	}
! 	
! 	public void correctTag(Tag tag) {
! 		String tempText = tag.getText();
! 		StringBuffer absorbedText = new StringBuffer();
! 		char c;
! 		for (int j=0;j<tempText.length();j++) {
! 			c = tempText.charAt(j);
! 			if (c!='"')
! 			absorbedText.append(c);
! 		}
! 		// Go into the next stage.
! 		StringBuffer result = insertInvertedCommasCorrectly(absorbedText);
! 		tag.setText(result.toString());
! 	}	
! 	public StringBuffer insertInvertedCommasCorrectly(StringBuffer absorbedText) {
! 		StringBuffer result = new StringBuffer();
! 		StringTokenizer tok = new StringTokenizer(absorbedText.toString(),"=",false);
! 		String token;
! 		token=  (String)tok.nextToken();
! 		result.append(token+"=");
! 		for (;tok.hasMoreTokens();) {
! 			token=  (String)tok.nextToken();
! 			token = pruneSpaces(token);
! 			result.append('"');
! 			int lastIndex = token.lastIndexOf(' ');
! 			if (lastIndex!=-1 && tok.hasMoreTokens()) {
! 				result.append(token.substring(0,lastIndex));
! 				result.append('"');
! 				result.append(token.substring(lastIndex,token.length()));
! 			} else result.append(token+'"');
! 			if (tok.hasMoreTokens()) result.append("=");
! 		}
! 		return result;
! 	}	
! 	public static String pruneSpaces(String token) {
! 		int firstSpace;
! 		int lastSpace;
! 		firstSpace = token.indexOf(' ');
! 		while (firstSpace==0) {
! 			token = token.substring(1,token.length());
! 			firstSpace = token.indexOf(' ');
! 		}
! 		lastSpace  = token.lastIndexOf(' ');
! 		while (lastSpace==token.length()-1) {
! 			token = token.substring(0,token.length()-1);
! 			lastSpace  = token.lastIndexOf(' ');
! 		}			
! 		return token;
! 	}	

      /**
--- 38,244 ----

  public class TagParser {
!     public final static int TAG_BEFORE_PARSING_STATE=1;
!     public final static int TAG_BEGIN_PARSING_STATE=1<<2;
!     public final static int TAG_FINISHED_PARSING_STATE=1<<3;
!     public final static int TAG_ILLEGAL_STATE=1<<4;
!     public final static int TAG_IGNORE_DATA_STATE=1<<5;     
!     public final static int TAG_IGNORE_BEGIN_TAG_STATE=1<<6;
!     public final static int TAG_IGNORE_CHAR_SINGLE_QUOTE=1<<7;
!     
!     public final static String ENCOUNTERED_QUERY_MESSAGE = "TagParser : Encountered > after a query. Accepting without correction and continuing parsing";
!     
!     private ParserFeedback feedback;

!     public TagParser(ParserFeedback feedback) {
!         this.feedback = feedback;
!     }

!     public Tag find(NodeReader reader,String input,int position) {
!         int state = TAG_BEFORE_PARSING_STATE;
!         int i=position;
!         char ch;
!         char[] ignorechar = new char[1]; // holds the character we're looking for when in TAG_IGNORE_DATA_STATE
!         Tag tag = new Tag(new TagData(position, 0, reader.getLastLineNumber(), 0, "", input, "", false));

!         Bool encounteredQuery = new Bool(false);
!         while (i<tag.getTagLine().length() && 
!                 state!=TAG_FINISHED_PARSING_STATE && 
!                 state!=TAG_ILLEGAL_STATE
!             )
!         {
!             ch = tag.getTagLine().charAt(i);
!             state = automataInput(encounteredQuery, i, state, ch, tag, i, ignorechar);
!             i = incrementCounter(i, reader, state, tag);
!         }
!         if (state==TAG_FINISHED_PARSING_STATE) {
!             String tagLine = tag.getTagLine();
!             if (i>1 && tagLine.charAt(i-2)=='/') {
!                 tag.setEmptyXmlTag(true);
!                 String tagContents = tag.getText();
!                 tag.setText(tagContents.substring(0,tagContents.length()-1));
!             }
!             return tag;
!         } else
!             return null;
!     }

!     private int automataInput(Bool encounteredQuery, int i, int state,char ch, Tag tag, int pos, char[] ignorechar) {
!         state = checkIllegalState(i, state, ch, tag);
!         state = checkFinishedState(encounteredQuery, i, state, ch, tag, pos);
!         state = toggleIgnoringState(state, ch, ignorechar);
!         if (state==TAG_BEFORE_PARSING_STATE && ch!='<') {
!             state= TAG_ILLEGAL_STATE;
!         }
!         if (state==TAG_IGNORE_DATA_STATE && ch=='<') {
!             // If the next tag char is is close tag, then
!             // this is legal, we should continue
!             if (!isWellFormedTag(tag,pos))
!                 state = TAG_IGNORE_BEGIN_TAG_STATE;
!         }
!         if (state==TAG_IGNORE_BEGIN_TAG_STATE && ch=='>') {
!             state = TAG_IGNORE_DATA_STATE;
!         }
!         checkIfAppendable(encounteredQuery, state, ch, tag);
!         state = checkBeginParsingState(i, state, ch, tag);

!         return state;
!     }

!     private int checkBeginParsingState(int i, int state, char ch, Tag tag) {
!         if (ch=='<' && 
!             (state==TAG_BEFORE_PARSING_STATE || 
!               state==TAG_ILLEGAL_STATE))
!         {
!             // Transition from State 0 to State 1 - Record data till > i...

[truncated message content]

Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/parserHelper AttributeParser.java,1.36,1.37 Composite

htmlparser-cvs