Re: [Htmlparser-developer] Re: Htmlparser-developer digest, Vol 1 #136 - 3 msgs
Brought to you by:
derrickoswald
|
From: Somik R. <so...@ya...> - 2002-12-12 05:15:28
|
Hi Sam,
The parse() is not being called, but the print() method is. From three
places :
NeuroGridHTMLParserTest.printLinks()
NeuroGridHTMLParserTest.printMetaTags()
NeuroGridHTMLParser.searchForSummaryContents()
If you mask this, output will be as you desire.
Regards,
Somik
----- Original Message -----
From: "Sam Joseph" <ga...@yh...>
To: <htm...@li...>
Sent: Wednesday, December 11, 2002 3:41 PM
Subject: [Htmlparser-developer] Re: Htmlparser-developer digest, Vol 1
#136 - 3 msgs
> Hi Somik,
>
> Sorry that my mails are not attaching to the thread properly. I'm on
> digest, so when I reply to the digest message I think a new thread gets
> automatically started, and the sourceforge mail interface doesn't let me
> reply directly to your messages.
>
> Thanks for your suggestion below. As far as I can see from the code the
> parse method on HTMLParser is not being called. In fact it uses exactly
> the thing you describe in your mail. I didn't really write this code.
> It's still basically the NeuroGridHTMLParser that you wrote a while
> back, modified into my coding format.
>
> Please find the code appended to this email. Both the links I have been
> parsing are specified in the NeuroGridHTMLParserTest.java file.
>
> Thanks in advance.
>
> CHEERS> SAM
>
> Somik wrote:
>
> >Sorry, I just saw your other mail again with the
> >output. I see the problem -
> >
> >You must be calling the parse method in
> >HTMLParser.java. That is only a demo. As mentioned in
> >the docs, you should be doing something like :
> >
> >(for HTMLEnumeration e =
> >parser.elements();e.hasMoreNodes();) {
> > HTMLNode node = e.nextHTMLNode();
> > // create summary here
> >}
> >
> >The call to parse has the printing stuff which prints
> >all the details of the nodes (calling node.print()).
> >
> >If this does not help, can you post your complete
> >parsing program ?
> >
>
>
----------------------------------------------------------------------------
----
> /*
> * (c) Copyright 2001 MyCorporation.
> * All Rights Reserved.
> */
> package com.neurogrid.parser;
> /**
> * Simple mutable holder that pairs a heading string with a contents string.
> *
> * @version 1.0
> * @author
> */
> public class Summary {
> private String heading;
> private String contents;
> /**
> * Constructor for Summary.
> *
> * @param heading The heading text; stored as given, without a null check
> * @param contents The contents text; stored as given, without a null check
> */
> public Summary(String heading, String contents) {
> this.heading = heading;
> this.contents = contents;
> }
>
> /**
> * Gets the heading.
> * @return Returns the heading String exactly as last stored
> */
> public String getHeading() {
> return heading;
> }
>
> /**
> * Sets the heading.
> * @param heading The heading to set
> */
> public void setHeading(String heading) {
> this.heading = heading;
> }
>
> /**
> * Gets the contents.
> * @return Returns the contents String exactly as last stored
> */
> public String getContents() {
> return contents;
> }
>
> /**
> * Sets the contents.
> * @param contents The contents to set
> */
> public void setContents(String contents) {
> this.contents = contents;
> }
> /**
> * Builds the text form of this summary: the heading, a newline and the
> * contents when the heading is non-empty, otherwise the contents alone.
> * NOTE(review): heading.length() is called without a null check, so a
> * null heading would raise a NullPointerException here.
> *
> * @return Returns the combined String
> */
> public String toString() {
> String retString;
> if (heading.length()>0) retString = heading+"\n"+contents;
> else retString = contents;
> return retString;
> }
> }
>
----------------------------------------------------------------------------
----
> package com.neurogrid.parser;
>
> /*
> * Copyright (C) 2000 NeuroGrid <sa...@ne...>
> *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version 2
> * of the License, or (at your option) any later version.
> *
> * This program is distributed in the hope that it will be useful,
> * but WITHOUT ANY WARRANTY; without even the implied warranty of
> * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> * GNU General Public License for more details.
> *
> * You should have received a copy of the GNU General Public License
> * along with this program; if not, write to the Free Software
> * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
> *
> * You may find further details about this software at
> * http://www.neurogrid.net/
> */
>
> import junit.framework.*;
>
> // Import log4j classes.
> import org.apache.log4j.Category;
> import org.apache.log4j.BasicConfigurator;
> import org.apache.log4j.PropertyConfigurator;
>
> import org.htmlparser.*;
> import org.htmlparser.tags.*;
> import org.htmlparser.scanners.*;
> import org.htmlparser.util.*;
> import java.util.Enumeration;
> import java.util.Vector;
>
> /**
> * @version 1.0
> * @author
> */
> public class NeuroGridHTMLParser
> {
> private static final String cvsInfo = "$Id:$";
> public static String getCvsInfo()
> {
> return cvsInfo;
> }
>
> private static Category o_cat =
Category.getInstance(NeuroGridHTMLParser.class.getName());
>
> /**
> * Initializes the logging system: calls BasicConfigurator.configure(),
> * then PropertyConfigurator.configure(p_conf), and finally logs an info
> * message through this class's Category.
> *
> * @param p_conf configuration filename handed to PropertyConfigurator
> */
> public static void init(String p_conf)
> {
> BasicConfigurator.configure();
> PropertyConfigurator.configure(p_conf);
> o_cat.info("NeuroGridHTMLParser logging Initialized");
> }
>
> private String o_url;
> private String o_full_text;
> private Vector o_meta_tags;
> private Vector o_link_tags;
> private Summary o_summary;
> private StringBuffer o_summary_heading;
> private StringBuffer o_summary_contents;
> private HTMLParser o_parser = null;
> private boolean o_h1_tag_found = false;
> private boolean o_start_summary_search = false;
> private int o_summary_count = 0;
>
>
> /**
> * This constructor is only to enable test cases.
> * For clients, pls use NeuroGridHTMLParser(String)
> * or NeuroGridHTMLParser(String,boolean)
> *
> * Delegates to NeuroGridHTMLParser(String,boolean) with an empty URL and
> * parsing switched off, then stores the supplied parser. parse() is not
> * called here; when it is called later it uses this parser instead of
> * creating one from the URL.
> *
> * @param p_parser the parser instance to use
> * @throws Exception declared because the delegated constructor declares it
> */
> public NeuroGridHTMLParser(HTMLParser p_parser)
> throws Exception
> {
> this("",false);
> o_parser = p_parser;
> }
>
> /**
> * Creates a parser for the given URL and parses it immediately, by
> * delegating to NeuroGridHTMLParser(String,boolean) with parsing enabled.
> *
> * @param p_url the URL to parse
> * @throws Exception whatever the delegated constructor (and parse()) throws
> */
> public NeuroGridHTMLParser(String p_url)
> throws Exception
> {
> this(p_url,true);
> }
>
> /**
> * Stores the URL, creates the empty meta-tag and link vectors and the
> * empty summary heading/contents buffers, and calls parse() when asked.
> * NOTE(review): o_full_text is not assigned here or at its declaration,
> * so it is still null when addToFullText() first appends to it.
> *
> * @param p_url the URL to parse
> * @param p_start_parsing true to call parse() from this constructor
> * @throws Exception whatever parse() throws when p_start_parsing is true
> */
> public NeuroGridHTMLParser(String p_url, boolean p_start_parsing)
> throws Exception
> {
> o_url = p_url;
> o_meta_tags = new Vector();
> o_link_tags = new Vector();
> o_summary_heading = new StringBuffer();
> o_summary_contents = new StringBuffer();
> if (p_start_parsing) parse();
> }
> /**
> * HTMLParserFeedback implementation that discards info and warning
> * messages. error() also ignores the message text and only prints the
> * stack trace of the exception it is given.
> */
> private class BlankHTMLParserFeedback
> implements HTMLParserFeedback
> {
> public void info(String message)
> {
> //System.out.println("INFO: " + message);
> }
>
> public void warning(String message)
> {
> //System.out.println("WARNING: " + message);
> }
>
> public void error(String message, HTMLParserException e)
> {
> //System.out.println("ERROR: " + message);
> e.printStackTrace();
> }
> }
>
>
>
> /**
> * Parses the page. If no parser was supplied, one is created for o_url
> * with a BlankHTMLParserFeedback. An HTMLMetaTagScanner, HTMLLinkScanner
> * and HTMLTitleScanner are then added to the parser, the nodes are
> * walked by parseURLForData(), and the result of createSummary() is
> * stored as the summary.
> *
> * @throws Exception propagated from parser creation or parseURLForData()
> */
> public final void parse()
> throws Exception
> {
> if (o_parser==null)
> o_parser = new HTMLParser(o_url, new BlankHTMLParserFeedback());
>
> o_parser.addScanner(new HTMLMetaTagScanner("-t"));
> o_parser.addScanner(new HTMLLinkScanner("-l"));
> o_parser.addScanner(new HTMLTitleScanner("-a"));
> parseURLForData();
> o_summary = createSummary();
> }
>
> /**
> * Walks every node returned by o_parser.elements() and collects data.
> * Each node first goes through the title, meta-tag, link-tag and generic
> * tag checks. Because checkForTag() runs before the branch below, the
> * node that sets o_h1_tag_found is itself passed to processH1Tag().
> * While o_h1_tag_found is set, nodes go only to processH1Tag(); they are
> * not added to the full text. Otherwise the node is offered to the
> * summary search (once o_start_summary_search is set) and then to
> * addToFullText().
> *
> * @throws Exception declared by this method; source not visible here
> */
> private void parseURLForData()
> throws Exception
> {
> HTMLNode x_node;
> for (HTMLEnumeration e = o_parser.elements();e.hasMoreNodes();)
> {
> x_node = e.nextHTMLNode();
> checkForTitle(x_node);
> checkForMetaTag(x_node);
> checkForLinkTag(x_node);
> checkForTag(x_node);
> if(o_h1_tag_found == true)
> {
> o_h1_tag_found = processH1Tag(x_node);
> }
> else
> {
> if (o_start_summary_search)
> {
> searchForSummaryContents(x_node);
> }
> addToFullText(x_node);
> }
>
> }
> }
>
> /**
> * If the node is an HTMLTitleTag, logs its title at debug level and
> * appends the title followed by a newline to the summary heading buffer.
> * Nodes of any other type are ignored.
> *
> * @param p_node the node to inspect
> */
> protected void checkForTitle(HTMLNode p_node)
> {
> if(p_node instanceof HTMLTitleTag)
> {
> String x_title = ((HTMLTitleTag)p_node).getTitle();
> o_cat.debug("appending title: " + x_title);
> // I think it would be better to do one or the other of H1 and
title.
> //FIXXXXXXXXX
> o_summary_heading.append(x_title+"\n");
> }
> }
>
> /**
> * Appends this node's text to the full text when the node is an
> * HTMLStringNode; other node types are ignored.
> * NOTE(review): o_full_text is never initialized in the code shown, so
> * the first += concatenates onto null and the result starts with "null".
> *
> * @param p_node the node whose text may be appended
> */
> private void addToFullText(HTMLNode p_node)
> {
> if(p_node instanceof HTMLStringNode)
> {
> o_full_text += ((HTMLStringNode)p_node).getText();
> }
> }
>
> /**
> * Searches for summary contents. Only HTMLStringNode nodes are
> * considered. The node is first printed via p_node.print() (presumably
> * to standard output - confirm against HTMLNode; this call runs for
> * every string node examined). If the node's text is non-empty, passes
> * isAlphabetical() and is not made up only of spaces, the text plus a
> * newline is appended to the summary contents and the counter is
> * incremented. When the counter reaches 2 the summary search is
> * switched off.
> *
> * @param p_node the node to examine
> */
> private void searchForSummaryContents(HTMLNode p_node)
> {
> if(p_node instanceof HTMLStringNode)
> {
> //o_cat.debug("*** SEARCHING FOR SUMMARY ***");
> p_node.print();
> String x_contents = ((HTMLStringNode)p_node).getText();
> if(x_contents.length()>0 && isAlphabetical(x_contents) &&
!isEmpty(x_contents))
> {
> //o_cat.debug("x_contents = "+x_contents);
> o_summary_count++;
> o_summary_contents.append(x_contents+"\n");
> if(o_summary_count==2)
> {
> o_start_summary_search=false;
> }
> }
> }
> }
>
> /**
> * Checks whether this string consists only of space characters (' ').
> * A zero-length string also counts as empty. Any other character,
> * including tabs and newlines, makes the result false. The loop always
> * scans the whole string; it does not stop at the first non-space.
> *
> * @param p_text the text to test
> *
> * @return boolean true if every character is a space
> */
> private boolean isEmpty(String p_text)
> {
> boolean x_empty = true;
> for (int i=0;i<p_text.length();i++)
> {
> if (p_text.charAt(i) != ' ')
> {
> x_empty = false;
> }
> }
> return x_empty;
> }
>
> /**
> * Checks whether this string is alphabetical. The text is upper-cased
> * first; the result is true when every character is then 'A'-'Z', a
> * space, a full stop or a comma. Digits, other punctuation and other
> * whitespace make it false. A zero-length string yields true.
> *
> * @param p_text the text to test
> *
> * @return boolean true if only the allowed characters occur
> */
> private boolean isAlphabetical(String p_text)
> {
> char x_ch;
> p_text = p_text.toUpperCase();
> boolean x_return = true;
> for(int i=0;i<p_text.length();i++)
> {
> x_ch = p_text.charAt(i);
> if (!((x_ch>='A' && x_ch <='Z')|| (x_ch==' ' || x_ch=='.' ||
x_ch==',')))
> {
> x_return =false;
> }
> }
> return x_return;
> }
>
> /**
> * If the node is an HTMLTag, runs the H1 check and then the BODY check
> * on it. Nodes of any other type are ignored.
> *
> * @param p_node the node to inspect
> */
> private void checkForTag(HTMLNode p_node)
> {
> if(p_node instanceof HTMLTag)
> {
> HTMLTag x_tag = (HTMLTag)p_node;
> checkForH1Tag(x_tag);
> checkForBodyTag(x_tag);
> }
> }
>
> /**
> * Checks for a body tag and, if found, switches the summary search on.
> * The test is a substring match: the upper-cased tag text only has to
> * contain "BODY" somewhere, so any tag text that includes those letters
> * (for example one containing "TBODY") also matches.
> *
> * @param p_tag the tag to inspect
> */
> private void checkForBodyTag(HTMLTag p_tag)
> {
> if(p_tag.getText().toUpperCase().indexOf("BODY")!=-1)
> {
> o_start_summary_search = true;
> }
> }
>
> /**
> * Checks for an H1 tag and, if found, sets o_h1_tag_found. The
> * upper-cased tag text must equal "H1" exactly; this is not a
> * substring match.
> *
> * @param tag the tag to inspect
> */
> private void checkForH1Tag(HTMLTag tag)
> {
> if (tag.getText().toUpperCase().equals("H1"))
> {
> o_h1_tag_found = true;
> }
> }
>
> /**
> * If the node is an HTMLMetaTag, adds it to the meta-tag vector.
> * Nodes of any other type are ignored.
> *
> * @param p_node the node to inspect
> */
> private void checkForMetaTag(HTMLNode p_node)
> {
> HTMLMetaTag x_meta_tag;
> if(p_node instanceof HTMLMetaTag)
> {
> x_meta_tag = (HTMLMetaTag) p_node;
> o_meta_tags.addElement(x_meta_tag);
> }
> }
>
> /**
> * If the node is an HTMLLinkTag, adds it to the link vector.
> * Nodes of any other type are ignored.
> *
> * @param p_node the node to inspect
> */
> private void checkForLinkTag(HTMLNode p_node)
> {
> HTMLLinkTag x_link_tag;
> if(p_node instanceof HTMLLinkTag)
> {
> x_link_tag = (HTMLLinkTag)p_node;
> o_link_tags.addElement(x_link_tag);
> }
> }
>
> /**
> * Processes a node seen while the H1 flag is set. The text of an
> * HTMLStringNode is appended to the summary heading (no newline is
> * added) and logged at debug level. An HTMLEndTag whose upper-cased tag
> * name equals "H1" ends the H1 section. All other nodes leave the state
> * unchanged.
> *
> * @param p_node the node to process
> *
> * @return boolean true to stay in H1 mode, false once the closing H1
> * end tag has been seen
> */
> private boolean processH1Tag(HTMLNode p_node)
> {
> boolean x_h1_tag_found = true;
> if(p_node instanceof HTMLStringNode)
> {
> o_summary_heading.append(((HTMLStringNode)p_node).getText());
> o_cat.debug("appending title: " +
((HTMLStringNode)p_node).getText());
> // I think it would be better to do one or the other of H1 and
title.
> //FIXXXXXXXXX
> }
> if(p_node instanceof HTMLEndTag)
> {
> HTMLEndTag x_end_tag =(HTMLEndTag)p_node;
> //o_cat.debug("x_end_tag.toString(): " + x_end_tag.toString());
> //o_cat.debug("x_end_tag.toHTML(): " + x_end_tag.toHTML());
> //o_cat.debug("x_end_tag.toPlainTextString(): " +
x_end_tag.toPlainTextString());
> //o_cat.debug("x_end_tag.getTagName(): " + x_end_tag.getTagName());
> //o_cat.debug("x_end_tag.getText(): " + x_end_tag.getText());
> if(x_end_tag.getTagName().toUpperCase().equals("H1"))
> {
> x_h1_tag_found = false;
> }
> }
> return x_h1_tag_found;
> }
>
>
>
> /**
> * Gets the Summary built by parse(). The field is only assigned inside
> * parse(), so this returns null if parse() has not been run.
> *
> * @return Summary the stored summary, or null before parsing
> */
> public Summary getSummary()
> {
> return o_summary;
> }
>
> /**
> * Gets the full text accumulated by addToFullText().
> * NOTE(review): the field is never initialized in the code shown, so
> * this returns null when no string node has been appended.
> *
> * @return String the accumulated text
> */
> public String getFullText()
> {
> return o_full_text;
> }
>
> /**
> * Gets the vector of link tags collected while parsing. The internal
> * Vector itself is returned, not a copy.
> *
> * @return Vector holding the HTMLLinkTag objects added by checkForLinkTag()
> */
> public Vector links()
> {
> return o_link_tags;
> }
>
> /**
> * Gets the vector of meta tags collected while parsing. The internal
> * Vector itself is returned, not a copy.
> *
> * @return Vector holding the HTMLMetaTag objects added by checkForMetaTag()
> */
> public Vector metaTags()
> {
> return o_meta_tags;
> }
>
> /**
> * Creates a Summary from the current contents of the summary heading
> * and summary contents buffers.
> *
> * @return Summary a new Summary holding both buffers as Strings
> */
> private Summary createSummary()
> {
> return new
Summary(o_summary_heading.toString(),o_summary_contents.toString());
> }
>
>
> /**
> * Command-line entry point. With no arguments it logs a syntax message
> * at debug level and exits with status -1. Otherwise it parses the URL
> * in args[0] and prints the links, meta tags, summary and full text.
> * Any exception is caught and its stack trace printed.
> *
> * @param args command-line arguments; args[0] is the URL to parse
> */
> public static void main(String[] args)
> {
> try
> {
> if (args.length==0)
> {
> o_cat.debug("Syntax:");
> o_cat.debug("java -jar neuroparser.jar URL");
> System.exit(-1);
> }
> o_cat.debug("Parsing "+args[0]+"..");
> o_cat.debug("");
> NeuroGridHTMLParser parser = new NeuroGridHTMLParser(args[0]);
> o_cat.debug("Printing links from "+args[0]);
> o_cat.debug("");
>
> printLinks(parser);
> printMetaTags(args, parser);
> printSummary(parser);
> printFullText(parser);
> }
> catch(Exception e)
> {e.printStackTrace();}
> }
> /**
> * Logs a "Summary" header and then passes the parser's Summary object
> * to o_cat.debug, all at debug level.
> *
> * @param parser the parser whose summary is logged
> */
> public static void printSummary(NeuroGridHTMLParser parser)
> {
> o_cat.debug("");
> o_cat.debug("Summary");
> o_cat.debug("-------");
> o_cat.debug(parser.getSummary());
> o_cat.debug("");
> }
> /**
> * Logs a "Full Text" header followed by the parser's full text, all at
> * debug level.
> *
> * @param parser the parser whose full text is logged
> */
> public static void printFullText(NeuroGridHTMLParser parser)
> {
> o_cat.debug("");
> o_cat.debug("Full Text");
> o_cat.debug("-------");
> o_cat.debug(parser.getFullText());
> o_cat.debug("");
> }
> /**
> * Logs a header naming args[0] at debug level, then calls print() on
> * every HTMLMetaTag in parser.metaTags(). The tags are output through
> * print(), not through the logger.
> *
> * @param args command-line arguments; only args[0] is used, in the header
> * @param parser the parser whose meta tags are printed
> */
> public static void printMetaTags(String[] args, NeuroGridHTMLParser
parser)
> {
> HTMLMetaTag metaTag;
> o_cat.debug("");
> o_cat.debug("Printing metaTags from "+args[0]);
> o_cat.debug("");
> for(Enumeration e = parser.metaTags().elements();e.hasMoreElements();)
> {
> metaTag = (HTMLMetaTag)e.nextElement();
> metaTag.print();
> }
> }
> /**
> * Calls print() on every HTMLLinkTag in parser.links(). The tags are
> * output through print(), not through the logger.
> *
> * @param parser the parser whose links are printed
> */
> public static void printLinks(NeuroGridHTMLParser parser)
> {
> HTMLLinkTag link;
> for(Enumeration e =parser.links().elements();e.hasMoreElements();)
> {
> link = (HTMLLinkTag)e.nextElement();
> link.print();
> }
> }
> }
>
----------------------------------------------------------------------------
----
> package com.neurogrid.parser;
>
> /*
> * Copyright (C) 2000 NeuroGrid <sa...@ne...>
> *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version 2
> * of the License, or (at your option) any later version.
> *
> * This program is distributed in the hope that it will be useful,
> * but WITHOUT ANY WARRANTY; without even the implied warranty of
> * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> * GNU General Public License for more details.
> *
> * You should have received a copy of the GNU General Public License
> * along with this program; if not, write to the Free Software
> * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
> *
> * You may find further details about this software at
> * http://www.neurogrid.net/
> */
>
> import junit.framework.*;
>
> // Import log4j classes.
> import org.apache.log4j.Category;
> import org.apache.log4j.BasicConfigurator;
> import org.apache.log4j.PropertyConfigurator;
>
> import org.htmlparser.*;
> import org.htmlparser.tags.*;
> import org.htmlparser.scanners.*;
> import java.util.Enumeration;
> import java.util.Vector;
>
>
> /**
> * @version 1.0
> * @author
> */
> public class NeuroGridHTMLParserTest
> extends TestCase
> {
> private static final String cvsInfo = "$Id:$";
> public static String getCvsInfo()
> {
> return cvsInfo;
> }
>
> private static Category o_cat =
Category.getInstance(NeuroGridHTMLParserTest.class.getName());
>
> /**
> * Initializes the logging system: calls BasicConfigurator.configure(),
> * then PropertyConfigurator.configure(p_conf), and finally logs an info
> * message through this class's Category.
> *
> * @param p_conf configuration filename handed to PropertyConfigurator
> */
> public static void init(String p_conf)
> {
> BasicConfigurator.configure();
> PropertyConfigurator.configure(p_conf);
> o_cat.info("NeuroGridHTMLParserTest logging Initialized");
> }
> /**
> * Runs the test from the command line: calls start(), then
> * init(args[0]), then testStuff().
> * NOTE(review): args[0] is read without a length check, and start()
> * has already called init("conf/log4j.properties"), so logging is
> * configured more than once.
> *
> * @param args command-line arguments; args[0] is a log4j config filename
> */
> public static void main(String[] args)
> {
> NeuroGridHTMLParserTest.start();
> NeuroGridHTMLParserTest.init(args[0]);
> NeuroGridHTMLParserTest.testStuff();
> }
>
> /**
> * Creates the test case, passing the given name to the TestCase
> * superclass constructor.
> *
> * @param p_name the name handed to TestCase
> */
> public NeuroGridHTMLParserTest(String p_name)
> {
> super(p_name);
> }
> /**
> * Test fixture set-up; simply calls start() to configure logging.
> */
> protected void setUp()
> {
> start();
> }
> /**
> * Configures logging for both this test class and NeuroGridHTMLParser
> * from "conf/log4j.properties". Any exception is caught and its stack
> * trace printed.
> */
> protected static void start()
> {
> try
> {
> NeuroGridHTMLParserTest.init("conf/log4j.properties");
> NeuroGridHTMLParser.init("conf/log4j.properties");
> }
> catch(Exception e){e.printStackTrace();}
> }
>
> /**
> * Parses the hard-coded URL in x_url with NeuroGridHTMLParser, then
> * prints its links, meta tags, summary and full text. The method makes
> * no assertions. Any exception is caught and its stack trace printed,
> * so nothing is propagated to the caller.
> */
> public static void testStuff()
> {
> try
> {
> // String x_url =
"http://belle.designwest.com/examples/test04b.html";
> String x_url =
"http://home.att.ne.jp/red/gaijin/tribal-hardware/index.htm";
>
> o_cat.debug("Parsing "+x_url+"..");
> o_cat.debug("");
> NeuroGridHTMLParser parser = new NeuroGridHTMLParser(x_url);
> o_cat.debug("Printing links from "+x_url);
> o_cat.debug("");
>
> printLinks(parser);
> printMetaTags(x_url, parser);
> printSummary(parser);
> printFullText(parser);
> }
> catch(Exception e)
> {e.printStackTrace();}
> }
>
> /**
> * Logs a "Summary" header, then the summary's heading and contents
> * separately, each followed by a separator line, all at debug level.
> *
> * @param parser the parser whose summary is logged
> */
> public static void printSummary(NeuroGridHTMLParser parser)
> {
> o_cat.debug("");
> o_cat.debug("Summary");
> o_cat.debug("-------");
> o_cat.debug(parser.getSummary().getHeading());
> o_cat.debug("-------");
> o_cat.debug(parser.getSummary().getContents());
> o_cat.debug("-------");
> o_cat.debug("");
> }
> /**
> * Logs a "Full Text" header followed by the parser's full text, all at
> * debug level.
> *
> * @param parser the parser whose full text is logged
> */
> public static void printFullText(NeuroGridHTMLParser parser)
> {
> o_cat.debug("");
> o_cat.debug("Full Text");
> o_cat.debug("-------");
> o_cat.debug(parser.getFullText());
> o_cat.debug("");
> }
> /**
> * Logs a header naming p_url at debug level, then calls print() on
> * every HTMLMetaTag in parser.metaTags(). The tags are output through
> * print(), not through the logger.
> *
> * @param p_url the URL shown in the header line
> * @param parser the parser whose meta tags are printed
> */
> public static void printMetaTags(String p_url, NeuroGridHTMLParser
parser)
> {
> HTMLMetaTag metaTag;
> o_cat.debug("");
> o_cat.debug("Printing metaTags from "+p_url);
> o_cat.debug("");
> for(Enumeration e = parser.metaTags().elements();e.hasMoreElements();)
> {
> metaTag = (HTMLMetaTag)e.nextElement();
> metaTag.print();
> }
> }
> /**
> * Calls print() on every HTMLLinkTag in parser.links(). The tags are
> * output through print(), not through the logger.
> *
> * @param parser the parser whose links are printed
> */
> public static void printLinks(NeuroGridHTMLParser parser)
> {
> HTMLLinkTag link;
> for(Enumeration e =parser.links().elements();e.hasMoreElements();)
> {
> link = (HTMLLinkTag)e.nextElement();
> link.print();
> }
> }
> }
>
|