[Htmlparser-cvs] htmlparser/docs/samples links.html,1.2,1.3
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-09-01 20:48:35
|
Update of /cvsroot/htmlparser/htmlparser/docs/samples In directory sc8-pr-cvs1:/tmp/cvs-serv29574 Modified Files: links.html Log Message: Fix bug #786869 LinkExtractor Sample not working. Index: links.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/samples/links.html,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** links.html 15 Dec 2002 03:41:25 -0000 1.2 --- links.html 1 Sep 2003 20:48:29 -0000 1.3 *************** *** 4,7 **** --- 4,10 ---- <title>Link and Mail Extractor </title> <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> + <style type="text/css"> + <!--code { font-family: Courier New, Courier; font-size: 10pt; margin: 0px; }--> + </style> </head> *************** *** 11,134 **** the parserApplications package (in the download bundle, this will be in src.zip).</p> <p> ! <!-- ======================================================== --> ! <!-- = Java Sourcecode to HTML automatically converted code = --> ! <!-- = J2H V2.0 2002 by Markus Gebhard ma...@ja... = --> ! <!-- = Further information: http://www.java2html.de = --> ! </p> <center> ! <table align="center" border="2" cellpadding="3" cellspacing="0" bgcolor="#FFFBF0"> <tr> - <!-- start line numbers --> - <td align="right" valign="top"> <code> <font color="#808080"> 1<br> - 2<br> - 3<br> - 4<br> - 5<br> - 6<br> - 7<br> - 8<br> - 9<br> - 10<br> - 11<br> - 12<br> - 13<br> - 14<br> - 15<br> - 16<br> - <a name="17"></a>17<br> - <a name="18"></a>18<br> - 19<br> - 20<br> - 21<br> - 22<br> - 23<br> - 24<br> - 25<br> - 26<br> - 27<br> - 28<br> - 29<br> - 30<br> - <a name="31"></a>31<br> - 32<br> - 33<br> - 34<br> - 35<br> - 36<br> - 37<br> - 38<br> - 39<br> - 40<br> - 41<br> - 42<br> - 43<br> - 44<br> - 45<br> - 46<br> - 47<br> - 48<br> - 49<br> - 50<br> - </font> </code> </td> - <!-- end line numbers --> <!-- start source code --> ! 
<td valign="top"> <code> <font color="#0000c0">import </font><font color="#000000"></font><font color="#000000">org.htmlparser.HTMLNode;<br> ! </font><font color="#0000c0">import </font><font color="#000000">org.htmlparser.HTMLParser;<br> ! </font><font color="#0000c0">import </font><font color="#000000">org.htmlparser.tags.HTMLLinkTag;<br> ! </font><font color="#0000c0">import </font><font color="#000000">org.htmlparser.util.HTMLEnumeration;<br> ! </font><font color="#0000c0">import </font><font color="#000000">org.htmlparser.util.HTMLParserException;<br> ! <br> ! </font><font color="#008000">/**<br> ! * LinkExtractor extracts all the links from the given webpage<br> ! * and prints them on standard output.<br> ! */<br> ! </font><font color="#0000c0">public class </font><font color="#000000">LinkExtractor </font><font color="#000000">{<br> ! </font><font color="#0000c0">private </font><font color="#000000">String location;<br> ! </font><font color="#0000c0">private </font><font color="#000000">HTMLParser parser;<br> ! </font><font color="#0000c0">public </font><font color="#000000">LinkExtractor</font><font color="#000000">(</font><font color="#000000">String location</font><font color="#000000">) {<br> ! </font><font color="#0000c0">this</font><font color="#000000">.location = location;<br> ! </font><font color="#0000c0">try </font><font color="#000000">{<br> ! </font><font color="#0000c0">this</font><font color="#000000">.parser = </font><font color="#0000c0">new </font><font color="#000000">HTMLParser</font><font color="#000000">(</font><font color="#000000">location</font><font color="#000000">)</font><font color="#000000">; </font><font color="#008000">// Create the parser object<br> ! </font><font color="#000000">parser.registerScanners</font><font color="#000000">()</font><font color="#000000">; </font><font color="#008000">// Register standard scanners (Very Important)<br> ! </font><font color="#000000">}<br> ! 
</font><font color="#0000c0">catch </font><font color="#000000">(</font><font color="#000000">HTMLParserException e</font><font color="#000000">) {<br> ! </font><font color="#000000">e.printStackTrace</font><font color="#000000">()</font><font color="#000000">;<br> ! </font><font color="#000000">}<br> ! <br> ! }<br> ! </font><font color="#0000c0">public </font><font color="#c00000">void </font><font color="#000000">extractLinks</font><font color="#000000">() </font><font color="#0000c0">throws </font><font color="#000000">HTMLParserException </font><font color="#000000">{<br> ! </font><font color="#000000">HTMLNode node;<br> ! HTMLLinkTag linkTag;<br> ! System.out.println</font><font color="#000000">(</font><font color="#990000">"Parsing "</font><font color="#000000">+location+</font><font color="#990000">" for links..."</font><font color="#000000">)</font><font color="#000000">;<br> ! </font><font color="#0000c0">for </font><font color="#000000">(</font><font color="#000000">HTMLEnumeration e = parser.elements</font><font color="#000000">()</font><font color="#000000">; ! e.hasMoreNodes</font><font color="#000000">()</font><font color="#000000">;</font><font color="#000000">) {<br> ! </font><font color="#000000">node = e.nextHTMLNode</font><font color="#000000">()</font><font color="#000000">; </font><font color="#008000">// Get the next HTML Node<br> ! </font><font color="#0000c0">if </font><font color="#000000">(</font><font color="#000000">node </font><font color="#0000c0">instanceof </font><font color="#000000">HTMLLinkTag</font><font color="#000000">) {<br> ! </font><font color="#000000">linkTag = </font><font color="#000000">(</font><font color="#000000">HTMLLinkTag</font><font color="#000000">)</font><font color="#000000">node; </font><font color="#008000">// Downcast to a Link Tag<br> ! </font><font color="#000000">linkTag.print</font><font color="#000000">()</font><font color="#000000">; </font><font color="#008000">// Print it<br> ! 
</font><font color="#000000">}<br> ! }<br> ! }<br> ! </font><font color="#0000c0">public static </font><font color="#c00000">void </font><font color="#000000">main</font><font color="#000000">(</font><font color="#000000">String</font><font color="#000000">[] </font><font color="#000000">args</font><font color="#000000">) {<br> ! </font><font color="#0000c0">if </font><font color="#000000">(</font><font color="#000000">args.length<</font><font color="#990000">0</font><font color="#000000">) {<br> ! </font><font color="#000000">System.err.println</font><font color="#000000">(</font><font color="#990000">"Syntax Error : Please provide the location(URL or file) to parse"</font><font color="#000000">)</font><font color="#000000">;<br> ! System.exit</font><font color="#000000">(</font><font color="#000000">-</font><font color="#990000">1</font><font color="#000000">)</font><font color="#000000">;<br> ! </font><font color="#000000">}<br> ! </font><font color="#000000">LinkExtractor linkExtractor = </font><font color="#0000c0">new </font><font color="#000000">LinkExtractor</font><font color="#000000">(</font><font color="#000000">args</font><font color="#000000">[</font><font color="#990000">0</font><font color="#000000">])</font><font color="#000000">;<br> ! </font><font color="#0000c0">try </font><font color="#000000">{<br> ! </font><font color="#000000">linkExtractor.extractLinks</font><font color="#000000">()</font><font color="#000000">;<br> ! </font><font color="#000000">}<br> ! </font><font color="#0000c0">catch </font><font color="#000000">(</font><font color="#000000">HTMLParserException e</font><font color="#000000">) {<br> ! </font><font color="#000000">e.printStackTrace</font><font color="#000000">()</font><font color="#000000">;<br> ! </font><font color="#000000">}<br> ! }<br> ! </font><font color="#000000">}</font></code> </td> - </tr> <!-- end source code --> <!-- start J2H link --> <tr> ! 
<td colspan=2 align=right> <small> <a href="http://www.java2html.de" target="_blank">Java2html</a> --- 14,85 ---- the parserApplications package (in the download bundle, this will be in src.zip).</p> <p> ! ! <!-- ======================================================== --> ! <!-- = Java Sourcecode to HTML automatically converted code = --> ! <!-- = Java to HTML Converter V3.2 2003 by Markus Gebhard ma...@ja... = --> ! <!-- = Further information: http://www.java2html.de = --> <center> ! <table align="center" border="2" cellpadding="3" cellspacing="0" bgcolor="#ffffff"> <tr> <!-- start source code --> ! <td nowrap valign="top" align="left"> ! <code> ! <font color="#808080">01 </font><font color="#0000c0"><b>import </b></font><font color="#000000">org.htmlparser.Node;</font><br> ! <font color="#808080">02 </font><font color="#0000c0"><b>import </b></font><font color="#000000">org.htmlparser.Parser;</font><br> ! <font color="#808080">03 </font><font color="#0000c0"><b>import </b></font><font color="#000000">org.htmlparser.tags.LinkTag;</font><br> ! <font color="#808080">04 </font><font color="#0000c0"><b>import </b></font><font color="#000000">org.htmlparser.util.ParserException;</font><br> ! <font color="#808080">05 </font><font color="#ffffff"></font><br> ! <font color="#808080">06 </font><font color="#008000">/**</font><br> ! <font color="#808080">07 </font><font color="#ffffff"> </font><font color="#008000">* LinkExtractor extracts all the links from the given webpage</font><br> ! <font color="#808080">08 </font><font color="#ffffff"> </font><font color="#008000">* and prints them on standard output.</font><br> ! <font color="#808080">09 </font><font color="#ffffff"> </font><font color="#008000">*/</font><br> ! <font color="#808080">10 </font><font color="#0000c0"><b>public class </b></font><font color="#000000">LinkExtractor </font><font color="#000000">{</font><br> ! 
<font color="#808080">11 </font><font color="#ffffff"> </font><font color="#0000c0"><b>private </b></font><font color="#000000">String location;</font><br> ! <font color="#808080">12 </font><font color="#ffffff"> </font><font color="#0000c0"><b>private </b></font><font color="#000000">Parser parser;</font><br> ! <font color="#808080">13 </font><font color="#ffffff"> </font><font color="#0000c0"><b>public </b></font><font color="#000000">LinkExtractor</font><font color="#000000">(</font><font color="#000000">String location</font><font color="#000000">) {</font><br> ! <font color="#808080">14 </font><font color="#ffffff"> </font><font color="#0000c0"><b>this</b></font><font color="#000000">.location = location;</font><br> ! <font color="#808080">15 </font><font color="#ffffff"> </font><font color="#0000c0"><b>try </b></font><font color="#000000">{</font><br> ! <font color="#808080"><a name="16">16</a> </font><font color="#ffffff"> </font><font color="#0000c0"><b>this</b></font><font color="#000000">.parser = </font><font color="#0000c0"><b>new </b></font><font color="#000000">Parser</font><font color="#000000">(</font><font color="#000000">location</font><font color="#000000">)</font><font color="#000000">; </font><font color="#008000">// Create the parser object</font><br> ! <font color="#808080"><a name="17">17</a> </font><font color="#ffffff"> </font><font color="#000000">parser.registerScanners</font><font color="#000000">()</font><font color="#000000">; </font><font color="#008000">// Register standard scanners (Very Important)</font><br> ! <font color="#808080">18 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! <font color="#808080">19 </font><font color="#ffffff"> </font><font color="#0000c0"><b>catch </b></font><font color="#000000">(</font><font color="#000000">ParserException e</font><font color="#000000">) {</font><br> ! 
<font color="#808080">20 </font><font color="#ffffff"> </font><font color="#000000">e.printStackTrace</font><font color="#000000">()</font><font color="#000000">;</font><br> ! <font color="#808080">21 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! <font color="#808080">22 </font><font color="#ffffff"> </font><br> ! <font color="#808080">23 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! <font color="#808080">24 </font><font color="#ffffff"> </font><font color="#0000c0"><b>public </b></font><font color="#c00000"><b>void </b></font><font color="#000000">extractLinks</font><font color="#000000">() </font><font color="#0000c0"><b>throws </b></font><font color="#000000">ParserException </font><font color="#000000">{</font><br> ! <font color="#808080">25 </font><font color="#ffffff"> </font><font color="#000000">System.out.println</font><font color="#000000">(</font><font color="#990000">"Parsing "</font><font color="#000000">+location+</font><font color="#990000">" for links..."</font><font color="#000000">)</font><font color="#000000">;</font><br> ! <font color="#808080"><a name="26">26</a> </font><font color="#ffffff"> </font><font color="#000000">Node </font><font color="#000000">[] </font><font color="#000000">links = parser.extractAllNodesThatAre</font><font color="#000000">(</font><font color="#000000">LinkTag.</font><font color="#0000c0"><b>class</b></font><font color="#000000">)</font><font color="#000000">;</font><br> ! <font color="#808080">27 </font><font color="#ffffff"> </font><font color="#0000c0"><b>for </b></font><font color="#000000">(</font><font color="#c00000"><b>int </b></font><font color="#000000">i = </font><font color="#990000">0</font><font color="#000000">;i < links.length;i++</font><font color="#000000">) {</font><br> ! 
<font color="#808080">28 </font><font color="#ffffff"> </font><font color="#000000">LinkTag linkTag = </font><font color="#000000">(</font><font color="#000000">LinkTag</font><font color="#000000">)</font><font color="#000000">links</font><font color="#000000">[</font><font color="#000000">i</font><font color="#000000">]</font><font color="#000000">;</font><br> ! <font color="#808080">29 </font><font color="#ffffff"> </font><font color="#008000">// To extract only mail addresses, uncomment the following line</font><br> ! <font color="#808080">30 </font><font color="#ffffff"> </font><font color="#008000">// if (linkTag.isMailLink())</font><br> ! <font color="#808080">31 </font><font color="#ffffff"> </font><font color="#000000">System.out.println</font><font color="#000000">(</font><font color="#000000">linkTag.getLink</font><font color="#000000">())</font><font color="#000000">;</font><br> ! <font color="#808080">32 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! <font color="#808080">33 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! <font color="#808080">34 </font><font color="#ffffff"></font><br> ! <font color="#808080">35 </font><font color="#ffffff"> </font><font color="#0000c0"><b>public static </b></font><font color="#c00000"><b>void </b></font><font color="#000000">main</font><font color="#000000">(</font><font color="#000000">String</font><font color="#000000">[] </font><font color="#000000">args</font><font color="#000000">) {</font><br> ! <font color="#808080">36 </font><font color="#ffffff"> </font><font color="#0000c0"><b>if </b></font><font color="#000000">(</font><font color="#000000">args.length != </font><font color="#990000">1</font><font color="#000000">) {</font><br> ! 
<font color="#808080">37 </font><font color="#ffffff"> </font><font color="#000000">System.err.println</font><font color="#000000">(</font><font color="#990000">"Syntax Error : Please provide the location(URL or file) to parse"</font><font color="#000000">)</font><font color="#000000">;</font><br> ! <font color="#808080">38 </font><font color="#ffffff"> </font><font color="#000000">System.exit</font><font color="#000000">(</font><font color="#000000">-</font><font color="#990000">1</font><font color="#000000">)</font><font color="#000000">;</font><br> ! <font color="#808080">39 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! <font color="#808080">40 </font><font color="#ffffff"> </font><font color="#000000">LinkExtractor linkExtractor = </font><font color="#0000c0"><b>new </b></font><font color="#000000">LinkExtractor</font><font color="#000000">(</font><font color="#000000">args</font><font color="#000000">[</font><font color="#990000">0</font><font color="#000000">])</font><font color="#000000">;</font><br> ! <font color="#808080">41 </font><font color="#ffffff"> </font><font color="#0000c0"><b>try </b></font><font color="#000000">{</font><br> ! <font color="#808080">42 </font><font color="#ffffff"> </font><font color="#000000">linkExtractor.extractLinks</font><font color="#000000">()</font><font color="#000000">;</font><br> ! <font color="#808080">43 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! <font color="#808080">44 </font><font color="#ffffff"> </font><font color="#0000c0"><b>catch </b></font><font color="#000000">(</font><font color="#000000">ParserException e</font><font color="#000000">) {</font><br> ! <font color="#808080">45 </font><font color="#ffffff"> </font><font color="#000000">e.printStackTrace</font><font color="#000000">()</font><font color="#000000">;</font><br> ! <font color="#808080">46 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! 
<font color="#808080">47 </font><font color="#ffffff"> </font><font color="#000000">}</font><br> ! <font color="#808080">48 </font><font color="#000000">}</font><br> ! <font color="#ffffff"></font><font color="#ffffff"> ! </font></code> </td> <!-- end source code --> + </tr> <!-- start J2H link --> <tr> ! <td align="right"> <small> <a href="http://www.java2html.de" target="_blank">Java2html</a> *************** *** 143,170 **** ! ! <p>Let us look at whats really happening here. In <a href="#17">line 17</a>, we are creating a parser object that will work on a given location. The location ! can be a URL or a file. In <a href="#18">line 18</a>, we ask the parser to register itself with certain standard scanners, that give it the capability to handle ! special tags (like links, images, etc..). If line 18 is not provided, the program will not output any link tags.</p> ! <p>The extractLinks() method is the heart of the program. All it really has a ! loop that utilizes an iterator - called HTMLEnumeration. HTMLEnumeration has ! a really simple interface - hasMoreNodes() and nextHTMLNode(). This is very ! similar to java's own Enumeration (hasMoreElements() and nextElement()). The ! reason of not using Enumeration and instead having our own version of it - is ! that we'd like to save previous CPU time on the downcasts. Since we know that ! we're only dealing with HTMLNode objects, we'd like to directly return them. </p> ! <p>Every call to nextHTMLNode() advances the parser to the next object. Please ! remember that this is a streaming parser, and the elements are coming in real ! time. </p> ! <p>Now, we'd like to filter out all objects except links. <a href="#31">Line 31</a> ! accomplishes this. We check if the node is an instance of HTMLLinkTag. If it ! is, then this is the link object that has just been parsed from the webpage. ! We can make a call to its print() method, or we can print it the way we like, using its interface methods such as getLink() and getLinkText(). 
Try making ! the following modifications to Line 33.</p> <p> <!-- ======================================================== --> --- 94,112 ---- ! <p>Let us look at what's really happening here. In <a href="#16">line 16</a>, we are creating a parser object that will work on a given location. The location ! can be a URL or a file. In <a href="#17">line 17</a>, we ask the parser to register itself with certain standard scanners, that give it the capability to handle ! special tags (like links, images, etc.). If line 17 is not provided, the program will not output any link tags.</p> ! <p>The extractAllNodesThatAre() method call on <a href="#26">line 26</a> in ! extractLinks() is the heart of the program. As its name suggests, it traverses ! all the nodes in the page and picks out the ones that match the class object ! provided, in this case a LinkTag class. </p> ! <p>We can now cycle through the array of nodes and print out each link URL. ! We can make a call to the LinkTag print() method, or we can print it the way we like, using its interface methods such as getLink() and getLinkText(). Try making ! the following modifications to Line 31.</p> <p> <!-- ======================================================== --> *************** *** 177,181 **** <tr> <!-- start line numbers --> ! <td align="right" valign="top"> <code> <font color="#808080"> 33<br> </font> </code> --- 119,123 ---- <tr> <!-- start line numbers --> ! <td align="right" valign="top"> <code> <font color="#808080"> 31<br> </font> </code> *************** *** 215,219 **** <tr> <!-- start line numbers --> ! <td align="right" valign="top"> <code> <font color="#808080"> 33<br> </font> </code> --- 157,161 ---- <tr> <!-- start line numbers --> ! <td align="right" valign="top"> <code> <font color="#808080"> 31<br> </font> </code> *************** *** 241,287 **** <!-- = END of automatically generated HTML code = --> <!-- ======================================================== --> ! 
<p>We can also check if the link is a mailto link, using the method HTMLLinkTag.isMailLink(). ! Suppose we wish to modify the above program to print all the email addresses ! from a webpage, we'd only have to modify line 33 to : </p> ! <p> ! <!-- ======================================================== --> ! <!-- = Java Sourcecode to HTML automatically converted code = --> ! <!-- = J2H V2.0 2002 by Markus Gebhard ma...@ja... = --> ! <!-- = Further information: http://www.java2html.de = --> ! </p> ! <center> ! <table align="center" border="2" cellpadding="3" cellspacing="0" bgcolor="#FFFBF0"> ! <tr> ! <!-- start line numbers --> ! <td align="right" valign="top"> <code> <font color="#808080"> 33<br> ! </font> ! </code> ! </td> ! <!-- end line numbers --> ! <!-- start source code --> ! <td valign="top"> ! <code> ! <font color="#0000c0">if </font><font color="#000000">(</font><font color="#000000">linkTag.isMailLink</font><font color="#000000">()) </font><font color="#000000">System.out.println</font><font color="#000000">(</font><font color="#000000">linkTag.getLink</font><font color="#000000">())</font><font color="#000000">;</font></code> ! ! </td> ! </tr> ! <!-- end source code --> ! <!-- start J2H link --> ! <tr> ! <td colspan=2 align=right> ! <small> ! <a href="http://www.java2html.de" target="_blank">Java2html</a> ! </small> ! </td> ! </tr> ! <!-- end J2H link --> ! </table> ! </center> ! <!-- = END of automatically generated HTML code = --> ! <!-- ======================================================== --> ! <p><strong>Limitations of this program</strong></p> ! <p>It cannot provide embedded links - links that are within other recognized tags ! (like the form tag). To uniformly extract embedded links across all tags, check ! 
<a href="linksEmbedded.html">Extracting Embedded Links/Images</a>.</p> <p><a href="index.html">Back to Samples</a><br> <a href="../index.html">Back to HTMLParser Home Page</a></p> --- 183,188 ---- <!-- = END of automatically generated HTML code = --> <!-- ======================================================== --> ! <p>We can also check if the link is a mailto link, using the method LinkTag.isMailLink() ! by just uncommenting line 30.</p> <p><a href="index.html">Back to Samples</a><br> <a href="../index.html">Back to HTMLParser Home Page</a></p> |