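[Htmlparser-cvs] htmlparser/src/org/htmlparser/tests/lexerTests KitTest.java,NONE,1.1 LexerTests.java,NONE,1.1 AllTests.java,1.4,1.5 PageIndexTests.java,1.2,1.3 PageTests.java,1.4,1.5 SourceTests.java,1.3,1.4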

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests
In directory sc8-pr-cvs1:/tmp/cvs-serv9123/tests/lexerTests

Modified Files:
	AllTests.java PageIndexTests.java PageTests.java 
	SourceTests.java 
Added Files:
	KitTest.java LexerTests.java 
Log Message:
Third drop for new i/o subsystem.


--- NEW FILE: KitTest.java ---
/*
 * KitTest.java
 *
 * Created on August 16, 2003, 2:16 PM
 */

package org.htmlparser.tests.lexerTests;

import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.util.Vector;
import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.EditorKit;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTMLEditorKit.Parser;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.lexer.nodes.Attribute;
import org.htmlparser.lexer.nodes.TagNode;
import org.htmlparser.util.ParserException;

/**
 * Cross-checks the nodes produced by the htmlparser {@link Lexer} against the
 * callbacks issued by the Swing HTML parser for the same input.
 * <p>
 * The lexer output is supplied up front as a vector of nodes. Each parser
 * callback is then looked for in a short window of nodes starting at the
 * current position; a hit moves the position past the matching node, a miss
 * prints both sides to standard output and steps over one node.
 *
 * @author  derrick
 */
public class KitTest extends ParserCallback
{
    /** Number of nodes, counted from the current position, searched per callback. */
    private static final int LOOKAHEAD = 25;

    /** The nodes produced by the lexer, in the order they were read. */
    Vector mNodes;

    /** Index into {@link #mNodes} of the next node not yet matched or skipped. */
    int mIndex;

    /**
     * Creates a new instance of KitTest.
     * @param nodes The lexer nodes the parser callbacks are compared against.
     */
    public KitTest (Vector nodes)
    {
        mNodes = nodes;
        mIndex = 0;
    }

    /**
     * Compares a run of text reported by the Swing parser with the lexer nodes.
     * Non-breaking spaces (character 160) are written back as the
     * <code>&amp;nbsp;</code> reference before comparing.
     * @param data The characters of the text.
     * @param pos The position reported by the parser (not used).
     */
    public void handleText (char[] data, int pos)
    {
        StringBuffer sb;
        String theirs;

        sb = new StringBuffer (data.length);
        for (int i = 0; i < data.length; i++)
        {
            if (160 == (int)data[i])
                sb.append ("&nbsp;");
            else
                sb.append (data[i]);
        }
        theirs = sb.toString ();
        advance (theirs, findText (theirs));
    }

    /**
     * Compares a comment reported by the Swing parser with the lexer nodes.
     * @param data The characters of the comment.
     * @param pos The position reported by the parser (not used).
     */
    public void handleComment (char[] data, int pos)
    {
        String theirs;

        theirs = new String (data);
        advance (theirs, findText (theirs));
    }

    /**
     * Compares a start tag reported by the Swing parser with the lexer nodes.
     * @param t The tag.
     * @param a The attributes of the tag (not compared).
     * @param pos The position reported by the parser (not used).
     */
    public void handleStartTag (HTML.Tag t, MutableAttributeSet a, int pos)
    {
        String theirs;

        theirs = t.toString ();
        advance (theirs, findTag (theirs, true, false));
    }

    /**
     * Compares an end tag reported by the Swing parser with the lexer nodes.
     * @param t The tag.
     * @param pos The position reported by the parser (not used).
     */
    public void handleEndTag (HTML.Tag t, int pos)
    {
        String theirs;

        theirs = t.toString ();
        advance (theirs, findTag (theirs, false, true));
    }

    /**
     * Compares a simple tag reported by the Swing parser with the lexer nodes.
     * Either the start or the end form of the tag is accepted.
     * @param t The tag.
     * @param a The attributes of the tag (not compared).
     * @param pos The position reported by the parser (not used).
     */
    public void handleSimpleTag (HTML.Tag t, MutableAttributeSet a, int pos)
    {
        String theirs;

        theirs = t.toString ();
        advance (theirs, findTag (theirs, true, true));
    }

    /**
     * Ignores errors reported by the Swing parser.
     * @param errorMsg The error message (not used).
     * @param pos The position of the error (not used).
     */
    public void handleError (String errorMsg, int pos)
    {
        // deliberately silent: only node mismatches are reported
    }

    /**
     * Does nothing, there is no buffered output.
     */
    public void flush () throws BadLocationException
    {
    }

    /**
     * This is invoked after the stream has been parsed, but before
     * <code>flush</code>. <code>eol</code> will be one of \n, \r
     * or \r\n, which ever is encountered the most in parsing the
     * stream.
     *
     * @since 1.3
     */
    public void handleEndOfLineString (String eol)
    {
    }

    /**
     * Get the index just past the last node searched for the current callback.
     * @return The smaller of the look-ahead limit and the number of nodes.
     */
    private int windowEnd ()
    {
        return (Math.min (mIndex + LOOKAHEAD, mNodes.size ()));
    }

    /**
     * Search the look-ahead window for a node whose text matches.
     * @param theirs The text reported by the Swing parser.
     * @return The index of the first node whose text equals <code>theirs</code>
     * ignoring case, or -1 if there is none in the window.
     */
    private int findText (String theirs)
    {
        int end;
        Node node;

        end = windowEnd ();
        for (int i = mIndex; i < end; i++)
        {
            node = (Node)mNodes.elementAt (i);
            if (theirs.equalsIgnoreCase (node.getText ()))
                return (i);
        }

        return (-1);
    }

    /**
     * Search the look-ahead window for a tag node whose name matches.
     * The name of a tag is taken from its first attribute. An end tag is
     * recognized by a leading '/' on that name; requiring the slash keeps
     * <code>&lt;/i&gt;</code> from matching <code>&lt;li&gt;</code> or
     * <code>&lt;/head&gt;</code> from matching <code>&lt;thead&gt;</code>.
     * @param theirs The tag name reported by the Swing parser.
     * @param start If <code>true</code>, accept a tag named <code>theirs</code>.
     * @param end If <code>true</code>, accept a tag named '/' plus <code>theirs</code>.
     * @return The index of the first matching tag node, or -1 if there is
     * none in the window.
     */
    private int findTag (String theirs, boolean start, boolean end)
    {
        int limit;
        String ours;

        limit = windowEnd ();
        for (int i = mIndex; i < limit; i++)
        {
            ours = tagName ((Node)mNodes.elementAt (i));
            if (null == ours)
                continue;
            if (start && theirs.equalsIgnoreCase (ours))
                return (i);
            if (end && ours.startsWith ("/") && theirs.equalsIgnoreCase (ours.substring (1)))
                return (i);
        }

        return (-1);
    }

    /**
     * Get the name of a tag node.
     * @param node The node to examine.
     * @return The name of the first attribute of the tag, or <code>null</code>
     * if the node is not a tag or has no attributes.
     */
    private static String tagName (Node node)
    {
        Vector attributes;

        if (!(node instanceof TagNode))
            return (null);
        attributes = ((TagNode)node).getAttributesEx ();
        if ((null == attributes) || (0 == attributes.size ()))
            return (null);

        return (((Attribute)attributes.elementAt (0)).getName ());
    }

    /**
     * Move the current position according to the outcome of a search.
     * On a match the position moves past the matching node. On a miss both
     * sides are printed and one node is skipped; if the nodes are already
     * exhausted a placeholder is printed and the position stays put, since
     * the Swing parser can issue more callbacks than there are nodes.
     * @param theirs The text or tag name reported by the Swing parser.
     * @param match The index of the matching node, or -1 if none was found.
     */
    private void advance (String theirs, int match)
    {
        String ours;

        if (-1 != match)
        {
            mIndex = match + 1;
            return;
        }

        if (mIndex < mNodes.size ())
        {
            ours = ((Node)mNodes.elementAt (mIndex)).getText ();
            mIndex++;
        }
        else
            ours = "<no more nodes>";
        System.out.println ("theirs: " + theirs);
        System.out.println ("  ours: " + ours);
    }

//    /**
//     * Get the document data from the URL.
//     * @param rd The reader to read bytes from.
//     * @return The parsed HTML document.
//     */
//    protected static Element[] getData (Reader rd) throws IOException
//    {
//        EditorKit kit;
//        Document doc;
//        Element[] ret;
//
//        ret = null;
//
//        // need this because HTMLEditorKit is not thread safe apparently
//        synchronized (Boolean.TRUE)
//        {
//            kit = new HTMLEditorKit ();
//            doc = kit.createDefaultDocument ();
//            // the Document class does not yet handle charset's properly
//            doc.putProperty ("IgnoreCharsetDirective", Boolean.TRUE);
//
//            try
//            {
//                // parse the HTML
//                kit.read (rd, doc, 0);
//            }
//            catch (BadLocationException ble)
//            {
//                throw new IOException ("parse error " + ble.getMessage ());
//            }
//
//            ret = doc.getRootElements ();
//        }
//
//        return (ret);
//    }

//    public static void scanElements (Element element) throws BadLocationException
//    {
//        int start;
//        int end;
//        String string;
//        ElementIterator it;
//        Element child;
//
//        if (element.isLeaf ())
//        {
//            start = element.getStartOffset ();
//            end = element.getEndOffset ();
//            string = element.getDocument ().getText (start, end - start);
//            System.out.println (string);
//        }
//        else
//            // iterate through the elements of the element
//            for (int i = 0; i < element.getElementCount (); i++)
//            {
//                child = element.getElement (i);
//                scanElements (child);
//            }
//    }

    /**
     * Editor kit that makes the Swing HTML parser publicly obtainable.
     */
    class MyKit extends HTMLEditorKit
    {
        public MyKit ()
        {
        }

        /**
         * Get the parser of the editor kit.
         * @return The parser supplied by the superclass.
         */
        public HTMLEditorKit.Parser getParser ()
        {
            return (super.getParser ());
        }
    }

    /**
     * Get an editor kit to obtain a parser from.
     * @return A new kit.
     */
    public MyKit getKit ()
    {
        return (new MyKit ());
    }

    /**
     * Reads a page with the lexer, then replays the same characters through
     * the Swing parser and prints any differences found.
     * @param args the command line arguments (not used)
     * @exception ParserException If the lexer fails.
     * @exception IOException If the page cannot be fetched or read.
     */
    public static void main (String[] args) throws ParserException, IOException
    {
        Lexer lexer;
        Node node;
        Vector nodes;
        KitTest test;
        MyKit kit;
        Parser parser;

        // pass through it once to read the entire page
        URL url = new URL ("http://sourceforge.net/projects/htmlparser");
        lexer = new Lexer (url.openConnection ());
        nodes = new Vector ();
        while (null != (node = lexer.nextNode ()))
            nodes.addElement (node);

        // reset the reader so the Swing parser sees the same characters again
        lexer.getPage ().getSource ().reset ();
        test = new KitTest (nodes);
        kit = test.getKit ();
        parser = kit.getParser ();
        parser.parse ((Reader)lexer.getPage ().getSource (), (ParserCallback)test, true);
    }

}

--- NEW FILE: LexerTests.java ---
// HTMLParser Library v1_4_20030810 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// For any questions or suggestions, you can write to me at :
// Email :so...@in...
// 
// Postal Address : 
// Somik Raha
// Extreme Programmer & Coach
// Industrial Logic Corporation
// 2583 Cedar Street, Berkeley, 
// CA 94708, USA
// Website : http://www.industriallogic.com

package org.htmlparser.tests.lexerTests;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;

import junit.framework.TestCase;

import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.lexer.PageIndex;
import org.htmlparser.lexer.Stream;
import org.htmlparser.lexer.nodes.RemarkNode;
import org.htmlparser.lexer.nodes.StringNode;
import org.htmlparser.lexer.nodes.TagNode;
import org.htmlparser.util.ParserException;

/**
 * Test the Lexer class.
 */
public class LexerTests extends TestCase
{

    /**
     * Creates the test case.
     * @param name The name of the test method to run.
     */
    public LexerTests (String name)
    {
        super (name);
    }

    /**
     * Lex the input and check the text of the first node, which must be a string node.
     * @param input The characters handed to the lexer.
     * @param expected The text the first node should contain.
     */
    private void assertFirstString (String input, String expected) throws ParserException
    {
        StringNode node;

        node = (StringNode)new Lexer (input).nextNode ();
        assertNotNull ("no node returned", node);
        assertEquals ("StringNode contents wrong", expected, node.getText ());
    }

    /**
     * Fetch the next node, which must be a tag, and check its HTML.
     * @param lexer The lexer to read from.
     * @param expected The HTML the tag should render as.
     */
    private void assertNextTag (Lexer lexer, String expected) throws ParserException
    {
        TagNode node;

        node = (TagNode)lexer.nextNode ();
        assertNotNull ("no node returned", node);
        assertEquals ("Tag contents wrong", expected, node.toHtml ());
    }

    /**
     * Lex the input and check the HTML of the first node, which must be a remark.
     * @param input The characters handed to the lexer.
     * @param expected The HTML the remark should render as.
     */
    private void assertFirstRemark (String input, String expected) throws ParserException
    {
        RemarkNode node;

        node = (RemarkNode)new Lexer (input).nextNode ();
        assertNotNull ("no node returned", node);
        assertEquals ("Remark contents wrong", expected, node.toHtml ());
    }

    /**
     * Test operation without tags.
     */
    public void testPureText () throws ParserException
    {
        assertFirstString ("Hello world", "Hello world");
    }

    /**
     * Test operation with Unix line endings.
     */
    public void testUnixEOL () throws ParserException
    {
        assertFirstString ("Hello\nworld", "Hello\nworld");
    }

    /**
     * Test operation with Dos line endings (\r\n) and with a bare carriage return.
     */
    public void testDosEOL () throws ParserException
    {
        assertFirstString ("Hello\r\nworld", "Hello\r\nworld");
        assertFirstString ("Hello\rworld", "Hello\rworld");
    }

    /**
     * Test operation with line endings near the end of input.
     */
    public void testEOF_EOL () throws ParserException
    {
        assertFirstString ("Hello world\n", "Hello world\n");
        assertFirstString ("Hello world\r", "Hello world\r");
        assertFirstString ("Hello world\r\n", "Hello world\r\n");
    }

    /**
     * Test that tags stop string nodes.
     */
    public void testTagStops () throws ParserException
    {
        String[] references =
        {
            "Hello world",
            "Hello world\n",
            "Hello world\r\n",
            "Hello world\r",
        };
        String[] suffixes =
        {
            "<head>",
            "</head>",
            "<%=head%>",
            "<!--head-->",
        };

        // every text must come back unchanged whatever kind of tag follows it
        for (int i = 0; i < references.length; i++)
            for (int j = 0; j < suffixes.length; j++)
                assertFirstString (references[i] + suffixes[j], references[i]);
    }

    /**
     * Test operation with only tags.
     */
    public void testPureTag () throws ParserException
    {
        String reference;
        String suffix;
        Lexer lexer;

        reference = "<head>";
        assertNextTag (new Lexer (reference), reference);

        // two adjacent tags must come back as two separate nodes
        suffix = "<body>";
        lexer = new Lexer (reference + suffix);
        assertNextTag (lexer, reference);
        assertNextTag (lexer, suffix);
    }

    /**
     * Test operation with attributed tags.
     */
    public void testAttributedTag () throws ParserException
    {
        String reference;

        reference = "<head lang='en_US' dir=ltr\nprofile=\"http://htmlparser.sourceforge.org/dictionary.html\">";
        assertNextTag (new Lexer (reference), reference);
    }

    /**
     * Test operation with comments.
     */
    public void testRemarkNode () throws ParserException
    {
        String[] references =
        {
            "<!-- This is a comment -->",
            "<!-- This is a comment --  >",
            "<!-- This is a\nmultiline comment -->",
        };
        String suffix;

        // comments alone
        for (int i = 0; i < references.length; i++)
            assertFirstRemark (references[i], references[i]);

        // comments followed by a tag
        suffix = "<head>";
        for (int i = 0; i < references.length; i++)
            assertFirstRemark (references[i] + suffix, references[i]);
    }

    /**
     * Try a real page.
     * Needs network access; the nodes are only printed, not checked.
     */
    public void testReal () throws ParserException, IOException
    {
        Lexer lexer;
        Node node;

        URL url = new URL ("http://sourceforge.net/projects/htmlparser");
        lexer = new Lexer (url.openConnection ());
        while (null != (node = lexer.nextNode ()))
            System.out.println (node.toString ());
    }

}

Index: AllTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/AllTests.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** AllTests.java	11 Aug 2003 00:18:31 -0000	1.4
--- AllTests.java	17 Aug 2003 16:09:27 -0000	1.5
***************
*** 46,49 ****
--- 46,50 ----
  		suite.addTestSuite (PageTests.class);
          suite.addTestSuite (PageIndexTests.class);
+         suite.addTestSuite (LexerTests.class);
          return suite; 
  	}

Index: PageIndexTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/PageIndexTests.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** PageIndexTests.java	11 Aug 2003 00:18:31 -0000	1.2
--- PageIndexTests.java	17 Aug 2003 16:09:27 -0000	1.3
***************
*** 123,131 ****
              // test for correct position
              if (0 <= pos - 1)
!                 assertTrue ("search error less " + pos + " " + index.elementAt (pos - 1) + " " + n, index.elementAt (pos - 1) < n);
              if (pos + 1 < index.size ())
                  assertTrue ("search error greater " + pos + " " + index.elementAt (pos + 1) + " " + n, index.elementAt (pos + 1) > n);
-                     
-             assertTrue ("wrong position", pos == index.add (n));
          }
  
--- 123,129 ----
              // test for correct position
              if (0 <= pos - 1)
!                 assertTrue ("search error less " + pos + " " + index.elementAt (pos - 1) + " " + n, index.elementAt (pos - 1) <= n);
              if (pos + 1 < index.size ())
                  assertTrue ("search error greater " + pos + " " + index.elementAt (pos + 1) + " " + n, index.elementAt (pos + 1) > n);
          }
  

Index: PageTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/PageTests.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** PageTests.java	11 Aug 2003 00:18:31 -0000	1.4
--- PageTests.java	17 Aug 2003 16:09:27 -0000	1.5
***************
*** 66,70 ****
          try
          {
!             page = new Page (null);
              assertTrue ("null value in constructor", false);
          }
--- 66,80 ----
          try
          {
!             page = new Page ((URLConnection)null);
!             assertTrue ("null value in constructor", false);
!         }
!         catch (IllegalArgumentException iae)
!         {
!             // expected outcome
!         }
! 
!         try
!         {
!             page = new Page ((String)null);
              assertTrue ("null value in constructor", false);
          }

Index: SourceTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/SourceTests.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** SourceTests.java	11 Aug 2003 00:18:31 -0000	1.3
--- SourceTests.java	17 Aug 2003 16:09:27 -0000	1.4
***************
*** 106,110 ****
          source = new Source (new Stream (new ByteArrayInputStream ("hello word".getBytes ())), null);
          assertTrue ("no character", -1 != source.read ());
!         source.close ();
          try
          {
--- 106,110 ----
          source = new Source (new Stream (new ByteArrayInputStream ("hello word".getBytes ())), null);
          assertTrue ("no character", -1 != source.read ());
!         source.destroy ();
          try
          {

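[Htmlparser-cvs] htmlparser/src/org/htmlparser/tests/lexerTests KitTest.java,NONE,1.1 LexerTests.java,NONE,1.1 AllTests.java,1.4,1.5 PageIndexTests.java,1.2,1.3 PageTests.java,1.4,1.5 SourceTests.java,1.3,1.4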

[Htmlparser-cvs] htmlparser/src/org/htmlparser/tests/lexerTests KitTest.java,NONE,1.1 LexerTests.java,NONE,1.1 AllTests.java,1.4,1.5 PageIndexTests.java,1.2,1.3 PageTests.java,1.4,1.5 SourceTests.java,1.3,1.4