Re: [Htmlparser-developer] Re: Htmlparser-developer digest, Vol 1 #136 - 3 msgs

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Hi Sam,
    The parse() method is not being called, but the print() method is, from
three places:
NeuroGridHTMLParserTest.printLinks()
NeuroGridHTMLParserTest.printMetaTags()
NeuroGridHTMLParser.searchForSummaryContents()

    If you comment out (mask) these print() calls, the output will be as you desire.
Regards,
Somik
----- Original Message -----
From: "Sam Joseph" <ga...@yh...>
To: <htm...@li...>
Sent: Wednesday, December 11, 2002 3:41 PM
Subject: [Htmlparser-developer] Re: Htmlparser-developer digest, Vol 1
#136 - 3 msgs


> Hi Somik,
>
> Sorry that my mails are not attaching to the thread properly. I'm on
> digest, so when I reply to the digest message I think a new thread gets
> automatically started, and the sourceforge mail interface doesn't let me
> reply directly to your messages.
>
> Thanks for your suggestion below. As far as I can see from the code the
> parse method on HTMLParser is not being called. In fact it uses exactly
> the thing you describe in your mail. I didn't really write this code.
> It's still basically the NeuroGridHTMLParser that you wrote a while
> back, modified into my coding format.
>
> Please find the code appended to this email. Both the links I have been
> parsing are specified in the NeuroGridHTMLParserTest.java file.
>
> Thanks in advance.
>
> CHEERS> SAM
>
> Somik wrote:
>
> >Sorry, I just saw your other mail again with the
> >output. I see the problem -
> >
> >You must be calling the parse method in
> >HTMLParser.java. That is only a demo. As mentioned in
> >the docs, you should be doing something like :
> >
> >for (HTMLEnumeration e =
> >parser.elements();e.hasMoreNodes();) {
> >   HTMLNode node = e.nextHTMLNode();
> >   // create summary here
> >}
> >
> >The call to parse has the printing stuff which prints
> >all the details of the nodes (calling node.print()).
> >
> >If this does not help, can you post your complete
> >parsing program ?
> >
>
>


----------------------------------------------------------------------------
----


> /*
>  * (c) Copyright 2001 MyCorporation.
>  * All Rights Reserved.
>  */
> package com.neurogrid.parser;
> /**
>  * @version 1.0
>  * @author
>  */
> public class Summary {
> private String heading;
> private String contents;
> /**
> * Constructor for Summary.
> */
> public Summary(String heading, String contents) {
> this.heading = heading;
> this.contents = contents;
> }
>
> /**
> * Gets the heading.
> * @return Returns a String
> */
> public String getHeading() {
> return heading;
> }
>
> /**
> * Sets the heading.
> * @param heading The heading to set
> */
> public void setHeading(String heading) {
> this.heading = heading;
> }
>
> /**
> * Gets the contents.
> * @return Returns a String
> */
> public String getContents() {
> return contents;
> }
>
> /**
> * Sets the contents.
> * @param contents The contents to set
> */
> public void setContents(String contents) {
> this.contents = contents;
> }
>
> public String toString() {
> String retString;
> if (heading.length()>0) retString = heading+"\n"+contents;
> else retString = contents;
> return retString;
> }
> }
>


----------------------------------------------------------------------------
----


> package com.neurogrid.parser;
>
> /*
>  * Copyright (C) 2000 NeuroGrid <sa...@ne...>
>  *
>  * This program is free software; you can redistribute it and/or
>  * modify it under the terms of the GNU General Public License
>  * as published by the Free Software Foundation; either version 2
>  * of the License, or (at your option) any later version.
>  *
>  * This program is distributed in the hope that it will be useful,
>  * but WITHOUT ANY WARRANTY; without even the implied warranty of
>  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>  * GNU General Public License for more details.
>  *
>  * You should have received a copy of the GNU General Public License
>  * along with this program; if not, write to the Free Software
>  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
USA.
>  *
>  * You may find further details about this software at
>  * http://www.neurogrid.net/
>  */
>
> import junit.framework.*;
>
>  // Import log4j classes.
> import org.apache.log4j.Category;
> import org.apache.log4j.BasicConfigurator;
> import org.apache.log4j.PropertyConfigurator;
>
> import org.htmlparser.*;
> import org.htmlparser.tags.*;
> import org.htmlparser.scanners.*;
> import org.htmlparser.util.*;
> import java.util.Enumeration;
> import java.util.Vector;
>
> /**
>  * @version 1.0
>  * @author
>  */
> public class NeuroGridHTMLParser
> {
>   private static final String cvsInfo = "$Id:$";
>   public static String getCvsInfo()
>   {
>     return cvsInfo;
>   }
>
>   private static Category o_cat =
Category.getInstance(NeuroGridHTMLParser.class.getName());
>
>   /**
>    * initialize the logging system
>    *
>    * @param p_conf      configuration filename
>    */
>   public static void init(String p_conf)
>   {
>     BasicConfigurator.configure();
>     PropertyConfigurator.configure(p_conf);
>     o_cat.info("NeuroGridHTMLParser logging Initialized");
>   }
>
>   private String o_url;
>   private String o_full_text;
>   private Vector o_meta_tags;
>   private Vector o_link_tags;
>   private Summary o_summary;
>   private StringBuffer o_summary_heading;
>   private StringBuffer o_summary_contents;
>   private HTMLParser o_parser               = null;
>   private boolean o_h1_tag_found            = false;
>   private boolean o_start_summary_search    = false;
>   private int o_summary_count               = 0;
>
>
>   /**
>    * This constructor is only to enable test cases.
>    * For clients, pls use NeuroGridHTMLParser(String)
>    * or NeuroGridHTMLParser(String,boolean)
>    *
>    * @param p_parser
>    */
>   public NeuroGridHTMLParser(HTMLParser p_parser)
>     throws Exception
>   {
>     this("",false);
>     o_parser = p_parser;
>   }
>
>   /**
>    *
>    * @param p_url
>    */
>   public NeuroGridHTMLParser(String p_url)
>     throws Exception
>   {
>     this(p_url,true);
>   }
>
>   /**
>    *
>    * @param p_url
>    * @param p_start_parsing
>    */
>   public NeuroGridHTMLParser(String p_url, boolean p_start_parsing)
>     throws Exception
>   {
>     o_url = p_url;
>     o_meta_tags = new Vector();
>     o_link_tags = new Vector();
>     o_summary_heading = new StringBuffer();
>     o_summary_contents = new StringBuffer();
>     if (p_start_parsing) parse();
>   }
>
>   private class BlankHTMLParserFeedback
>     implements HTMLParserFeedback
>   {
>     public void info(String message)
>     {
>       //System.out.println("INFO: " + message);
>     }
>
>     public void warning(String message)
>     {
>       //System.out.println("WARNING: " + message);
>     }
>
>     public void error(String message, HTMLParserException e)
>     {
>       //System.out.println("ERROR: " + message);
>       e.printStackTrace();
>     }
>   }
>
>
>
>   /**
>    * parse the page
>    */
>   public final void parse()
>     throws Exception
>   {
>     if (o_parser==null)
>       o_parser = new HTMLParser(o_url, new BlankHTMLParserFeedback());
>
>     o_parser.addScanner(new HTMLMetaTagScanner("-t"));
>     o_parser.addScanner(new HTMLLinkScanner("-l"));
>     o_parser.addScanner(new HTMLTitleScanner("-a"));
>     parseURLForData();
>     o_summary = createSummary();
>   }
>
>   /**
>    * parse the URL for data
>    */
>   private void parseURLForData()
>     throws Exception
>   {
>     HTMLNode x_node;
>     for (HTMLEnumeration e = o_parser.elements();e.hasMoreNodes();)
>     {
>       x_node = e.nextHTMLNode();
>       checkForTitle(x_node);
>       checkForMetaTag(x_node);
>       checkForLinkTag(x_node);
>       checkForTag(x_node);
>       if(o_h1_tag_found == true)
>       {
>         o_h1_tag_found = processH1Tag(x_node);
>       }
>       else
>       {
>       if (o_start_summary_search)
>       {
>         searchForSummaryContents(x_node);
>       }
>         addToFullText(x_node);
>       }
>
>     }
>   }
>
>   /**
>    * parse the URL for data
>    *
>    * @param HTMLNode
>    */
>   protected void checkForTitle(HTMLNode p_node)
>   {
>     if(p_node instanceof HTMLTitleTag)
>     {
>       String x_title = ((HTMLTitleTag)p_node).getTitle();
>       o_cat.debug("appending title: " + x_title);
>       // I think it would be better to do one or the other of H1 and
title.
>       //FIXXXXXXXXX
>       o_summary_heading.append(x_title+"\n");
>     }
>   }
>
>   /**
>    * add this nodes text to the full text
>    *
>    * @param HTMLNode
>    */
>   private void addToFullText(HTMLNode p_node)
>   {
>     if(p_node instanceof HTMLStringNode)
>     {
>       o_full_text += ((HTMLStringNode)p_node).getText();
>     }
>   }
>
>   /**
>    * search for summary contents
>    *
>    * @param HTMLNode
>    */
>   private void searchForSummaryContents(HTMLNode p_node)
>   {
>     if(p_node instanceof HTMLStringNode)
>     {
>       //o_cat.debug("*** SEARCHING FOR SUMMARY ***");
>       p_node.print();
>       String x_contents = ((HTMLStringNode)p_node).getText();
>       if(x_contents.length()>0 && isAlphabetical(x_contents) &&
!isEmpty(x_contents))
>       {
>         //o_cat.debug("x_contents = "+x_contents);
>         o_summary_count++;
>         o_summary_contents.append(x_contents+"\n");
>         if(o_summary_count==2)
>         {
>           o_start_summary_search=false;
>         }
>       }
>     }
>   }
>
>   /**
>    * check if this string is just spaces
>    *
>    * @param p_text
>    *
>    * @return boolean
>    */
>   private boolean isEmpty(String p_text)
>   {
>     boolean x_empty = true;
>     for (int i=0;i<p_text.length();i++)
>     {
>       if (p_text.charAt(i) != ' ')
>       {
>         x_empty = false;
>       }
>     }
>     return x_empty;
>   }
>
>   /**
>    * check if this string is alphabetical
>    *
>    * @param p_text
>    *
>    * @return boolean
>    */
>   private boolean isAlphabetical(String p_text)
>   {
>     char x_ch;
>     p_text = p_text.toUpperCase();
>     boolean x_return = true;
>     for(int i=0;i<p_text.length();i++)
>     {
>       x_ch = p_text.charAt(i);
>       if (!((x_ch>='A' && x_ch <='Z')|| (x_ch==' ' || x_ch=='.' ||
x_ch==',')))
>       {
>       x_return =false;
>       }
>     }
>     return x_return;
>   }
>
>   /**
>    * check for a tag
>    *
>    * @param p_node
>    */
>   private void checkForTag(HTMLNode p_node)
>   {
>     if(p_node instanceof HTMLTag)
>     {
>       HTMLTag x_tag = (HTMLTag)p_node;
>       checkForH1Tag(x_tag);
>       checkForBodyTag(x_tag);
>     }
>   }
>
>   /**
>    * check for a body tag
>    *
>    * @param p_node
>    */
>   private void checkForBodyTag(HTMLTag p_tag)
>   {
>     if(p_tag.getText().toUpperCase().indexOf("BODY")!=-1)
>     {
>       o_start_summary_search = true;
>     }
>   }
>
>   /**
>    * check for an H1 tag
>    *
>    * @param p_node
>    */
>   private void checkForH1Tag(HTMLTag tag)
>   {
>     if (tag.getText().toUpperCase().equals("H1"))
>     {
>       o_h1_tag_found = true;
>     }
>   }
>
>   /**
>    * check for a meta tag
>    *
>    * @param p_node
>    */
>   private void checkForMetaTag(HTMLNode p_node)
>   {
>     HTMLMetaTag x_meta_tag;
>     if(p_node instanceof HTMLMetaTag)
>     {
>       x_meta_tag = (HTMLMetaTag) p_node;
>       o_meta_tags.addElement(x_meta_tag);
>     }
>   }
>
>   /**
>    * check for a link tag
>    *
>    * @param p_node
>    */
>   private void checkForLinkTag(HTMLNode p_node)
>   {
>     HTMLLinkTag x_link_tag;
>     if(p_node instanceof HTMLLinkTag)
>     {
>       x_link_tag = (HTMLLinkTag)p_node;
>       o_link_tags.addElement(x_link_tag);
>     }
>   }
>
>   /**
>    * process an H1 tag
>    *
>    * @param p_node
>    *
>    * @return boolean
>    */
>   private boolean processH1Tag(HTMLNode p_node)
>   {
>     boolean x_h1_tag_found = true;
>     if(p_node instanceof HTMLStringNode)
>     {
>       o_summary_heading.append(((HTMLStringNode)p_node).getText());
>       o_cat.debug("appending title: " +
((HTMLStringNode)p_node).getText());
>       // I think it would be better to do one or the other of H1 and
title.
>       //FIXXXXXXXXX
>     }
>     if(p_node instanceof HTMLEndTag)
>     {
>       HTMLEndTag x_end_tag =(HTMLEndTag)p_node;
>       //o_cat.debug("x_end_tag.toString(): " + x_end_tag.toString());
>       //o_cat.debug("x_end_tag.toHTML(): " + x_end_tag.toHTML());
>       //o_cat.debug("x_end_tag.toPlainTextString(): " +
x_end_tag.toPlainTextString());
>       //o_cat.debug("x_end_tag.getTagName(): " + x_end_tag.getTagName());
>       //o_cat.debug("x_end_tag.getText(): " + x_end_tag.getText());
>       if(x_end_tag.getTagName().toUpperCase().equals("H1"))
>       {
>         x_h1_tag_found = false;
>       }
>     }
>     return x_h1_tag_found;
>   }
>
>
>
>   /**
>    * get the Summary
>    *
>    * @return Summary
>    */
>    public Summary getSummary()
>    {
>      return o_summary;
>    }
>
>   /**
>    * get the Full text
>    *
>    * @return String
>    */
>    public String getFullText()
>    {
>      return o_full_text;
>    }
>
>   /**
>    * get a vector of the links
>    *
>    * @return Vector
>    */
>    public Vector links()
>    {
>      return o_link_tags;
>    }
>
>   /**
>    * get a vector of meta tags
>    *
>    * @return Vector
>    */
>    public Vector metaTags()
>    {
>      return o_meta_tags;
>    }
>
>   /**
>    * create a summary
>    *
>    * @return Summary
>    */
>    private Summary createSummary()
>    {
>      return new
Summary(o_summary_heading.toString(),o_summary_contents.toString());
>    }
>
>
>   /**
>    * main
>    *
>    * @param args
>    */
>   public static void main(String[] args)
>   {
>   try
>   {
>     if (args.length==0)
>     {
>       o_cat.debug("Syntax:");
>       o_cat.debug("java -jar neuroparser.jar URL");
>       System.exit(-1);
>     }
>     o_cat.debug("Parsing "+args[0]+"..");
>     o_cat.debug("");
>     NeuroGridHTMLParser parser = new NeuroGridHTMLParser(args[0]);
>     o_cat.debug("Printing links from "+args[0]);
>     o_cat.debug("");
>
>     printLinks(parser);
>     printMetaTags(args, parser);
>     printSummary(parser);
>     printFullText(parser);
>         }
>         catch(Exception e)
>         {e.printStackTrace();}
>   }
>
>   public static void printSummary(NeuroGridHTMLParser parser)
>   {
>     o_cat.debug("");
>     o_cat.debug("Summary");
>     o_cat.debug("-------");
>     o_cat.debug(parser.getSummary());
>     o_cat.debug("");
>   }
>
>   public static void printFullText(NeuroGridHTMLParser parser)
>   {
>     o_cat.debug("");
>     o_cat.debug("Full Text");
>     o_cat.debug("-------");
>     o_cat.debug(parser.getFullText());
>     o_cat.debug("");
>   }
>
>   public static void printMetaTags(String[] args, NeuroGridHTMLParser
parser)
>   {
>     HTMLMetaTag metaTag;
>     o_cat.debug("");
>     o_cat.debug("Printing metaTags from "+args[0]);
>     o_cat.debug("");
>     for(Enumeration e = parser.metaTags().elements();e.hasMoreElements();)
>     {
>       metaTag = (HTMLMetaTag)e.nextElement();
>       metaTag.print();
>     }
>   }
>
>   public static void printLinks(NeuroGridHTMLParser parser)
>   {
>     HTMLLinkTag link;
>     for(Enumeration e =parser.links().elements();e.hasMoreElements();)
>     {
>       link = (HTMLLinkTag)e.nextElement();
>       link.print();
>     }
>   }
> }
>


----------------------------------------------------------------------------
----


> package com.neurogrid.parser;
>
> /*
>  * Copyright (C) 2000 NeuroGrid <sa...@ne...>
>  *
>  * This program is free software; you can redistribute it and/or
>  * modify it under the terms of the GNU General Public License
>  * as published by the Free Software Foundation; either version 2
>  * of the License, or (at your option) any later version.
>  *
>  * This program is distributed in the hope that it will be useful,
>  * but WITHOUT ANY WARRANTY; without even the implied warranty of
>  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>  * GNU General Public License for more details.
>  *
>  * You should have received a copy of the GNU General Public License
>  * along with this program; if not, write to the Free Software
>  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
USA.
>  *
>  * You may find further details about this software at
>  * http://www.neurogrid.net/
>  */
>
> import junit.framework.*;
>
>  // Import log4j classes.
> import org.apache.log4j.Category;
> import org.apache.log4j.BasicConfigurator;
> import org.apache.log4j.PropertyConfigurator;
>
> import org.htmlparser.*;
> import org.htmlparser.tags.*;
> import org.htmlparser.scanners.*;
> import java.util.Enumeration;
> import java.util.Vector;
>
>
> /**
>  * @version 1.0
>  * @author
>  */
> public class NeuroGridHTMLParserTest
>     extends TestCase
> {
>   private static final String cvsInfo = "$Id:$";
>   public static String getCvsInfo()
>   {
>     return cvsInfo;
>   }
>
>   private static Category o_cat =
Category.getInstance(NeuroGridHTMLParserTest.class.getName());
>
>   /**
>    * initialize the logging system
>    *
>    * @param p_conf      configuration filename
>    */
>   public static void init(String p_conf)
>   {
>     BasicConfigurator.configure();
>     PropertyConfigurator.configure(p_conf);
>     o_cat.info("NeuroGridHTMLParserTest logging Initialized");
>   }
>
>   public static void main(String[] args)
>   {
>     NeuroGridHTMLParserTest.start();
>     NeuroGridHTMLParserTest.init(args[0]);
>     NeuroGridHTMLParserTest.testStuff();
>   }
>
>   /**
>    * Subclasses must invoke this from their constructor.
>    */
>   public NeuroGridHTMLParserTest(String p_name)
>   {
>     super(p_name);
>   }
>
>   protected void setUp()
>   {
>     start();
>   }
>
>   protected static void start()
>   {
>     try
>     {
>       NeuroGridHTMLParserTest.init("conf/log4j.properties");
>       NeuroGridHTMLParser.init("conf/log4j.properties");
>     }
>     catch(Exception e){e.printStackTrace();}
>   }
>
>   /**
>    * test some stuff
>    */
>   public static void testStuff()
>   {
>     try
>     {
> //      String x_url =
"http://belle.designwest.com/examples/test04b.html";
>       String x_url =
"http://home.att.ne.jp/red/gaijin/tribal-hardware/index.htm";
>
>       o_cat.debug("Parsing "+x_url+"..");
>       o_cat.debug("");
>       NeuroGridHTMLParser parser = new NeuroGridHTMLParser(x_url);
>       o_cat.debug("Printing links from "+x_url);
>       o_cat.debug("");
>
>       printLinks(parser);
>       printMetaTags(x_url, parser);
>       printSummary(parser);
>       printFullText(parser);
>     }
>     catch(Exception e)
>     {e.printStackTrace();}
>   }
>
>
>   public static void printSummary(NeuroGridHTMLParser parser)
>   {
>     o_cat.debug("");
>     o_cat.debug("Summary");
>     o_cat.debug("-------");
>     o_cat.debug(parser.getSummary().getHeading());
>     o_cat.debug("-------");
>     o_cat.debug(parser.getSummary().getContents());
>     o_cat.debug("-------");
>     o_cat.debug("");
>   }
>
>   public static void printFullText(NeuroGridHTMLParser parser)
>   {
>     o_cat.debug("");
>     o_cat.debug("Full Text");
>     o_cat.debug("-------");
>     o_cat.debug(parser.getFullText());
>     o_cat.debug("");
>   }
>
>   public static void printMetaTags(String p_url, NeuroGridHTMLParser
parser)
>   {
>     HTMLMetaTag metaTag;
>     o_cat.debug("");
>     o_cat.debug("Printing metaTags from "+p_url);
>     o_cat.debug("");
>     for(Enumeration e = parser.metaTags().elements();e.hasMoreElements();)
>     {
>       metaTag = (HTMLMetaTag)e.nextElement();
>       metaTag.print();
>     }
>   }
>
>   public static void printLinks(NeuroGridHTMLParser parser)
>   {
>     HTMLLinkTag link;
>     for(Enumeration e =parser.links().elements();e.hasMoreElements();)
>     {
>       link = (HTMLLinkTag)e.nextElement();
>       link.print();
>     }
>   }
> }
>