[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Page.java,1.33,1.34
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2004-03-18 04:13:47
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24099/src/org/htmlparser/lexer Modified Files: Page.java Log Message: Deprecate LinkProcessor. Functionality moved to Page. Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** Page.java 31 Jan 2004 20:51:01 -0000 1.33 --- Page.java 18 Mar 2004 04:04:07 -0000 1.34 *************** *** 36,39 **** --- 36,40 ---- import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; + import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; *************** *** 41,45 **** import org.htmlparser.util.EncodingChangeException; - import org.htmlparser.util.LinkProcessor; import org.htmlparser.util.ParserException; --- 42,45 ---- *************** *** 75,78 **** --- 75,83 ---- /** + * The base URL for this page. + */ + protected String mBaseUrl; + + /** * The source of characters. */ *************** *** 90,99 **** /** - * The processor of relative links on this page. - * Holds any overridden base HREF. - */ - protected LinkProcessor mProcessor; - - /** * Messages for page not there (404). */ --- 95,98 ---- *************** *** 136,140 **** throw new IllegalArgumentException ("connection cannot be null"); setConnection (connection); ! mProcessor = null; } --- 135,139 ---- throw new IllegalArgumentException ("connection cannot be null"); setConnection (connection); ! mBaseUrl = null; } *************** *** 158,162 **** mConnection = null; mUrl = null; ! mProcessor = null; } --- 157,161 ---- mConnection = null; mUrl = null; ! mBaseUrl = null; } *************** *** 180,184 **** mConnection = null; mUrl = null; ! mProcessor = null; } --- 179,183 ---- mConnection = null; mUrl = null; ! 
mBaseUrl = null; } *************** *** 397,400 **** --- 396,417 ---- /** + * Gets the baseUrl. + * @return The base URL for this page, or <code>null</code> if not set. + */ + public String getBaseUrl () + { + return (mBaseUrl); + } + + /** + * Sets the baseUrl. + * @param url The base url for this page. + */ + public void setBaseUrl (String url) + { + mBaseUrl = url; + } + + /** * Get the source this page is reading from. */ *************** *** 720,741 **** /** ! * Get the link processor associated with this page. ! * @return The link processor that has the base HREF. */ ! public LinkProcessor getLinkProcessor () { ! if (null == mProcessor) ! mProcessor = new LinkProcessor (); ! ! return (mProcessor); } /** ! * Set the link processor associated with this page. ! * @param processor The new link processor for this page. */ ! public void setLinkProcessor (LinkProcessor processor) { ! mProcessor = processor; } --- 737,824 ---- /** ! * Build a URL from the link and base provided. ! * @param link The (relative) URI. ! * @param base The base URL of the page, either from the <BASE> tag ! * or, if none, the URL the page is being fetched from. ! * @return An absolute URL. */ ! public URL constructUrl (String link, String base) ! throws MalformedURLException { ! String path; ! boolean modified; ! boolean absolute; ! int index; ! URL url; // constructed URL combining relative link and base ! ! url = new URL (new URL (base), link); ! path = url.getFile (); ! modified = false; ! absolute = link.startsWith ("/"); ! if (!absolute) ! { // we prefer to fix incorrect relative links ! // this doesn't fix them all, just the ones at the start ! while (path.startsWith ("/.")) ! { ! if (path.startsWith ("/../")) ! { ! path = path.substring (3); ! modified = true; ! } ! else if (path.startsWith ("/./") || path.startsWith("/.")) ! { ! path = path.substring (2); ! modified = true; ! } ! else ! break; ! } ! } ! // fix backslashes ! while (-1 != (index = path.indexOf ("/\\"))) ! { ! 
path = path.substring (0, index + 1) + path.substring (index + 2); ! modified = true; ! } ! if (modified) ! url = new URL (url, path); ! ! return (url); } /** ! * Create an absolute URL from a relative link. ! * @param link The relative portion of a URL. ! * @return The fully qualified URL or the original link if it was absolute ! * already or a failure occurred. */ ! public String getAbsoluteURL (String link) { ! String base; ! URL url; ! String ret; ! ! if ((null == link) || ("".equals (link))) ! ret = ""; ! else ! try ! { ! base = getBaseUrl (); ! if (null == base) ! base = getUrl (); ! if (null == base) ! ret = link; ! else ! { ! url = constructUrl (link, base); ! ret = url.toExternalForm (); ! } ! } ! catch (MalformedURLException murle) ! { ! ret = link; ! } ! ! return (ret); } *************** *** 914,1022 **** } } - - // /** - // * The default charset. - // * This should be <code>ISO-8859-1</code>, - // * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1 - // * Another alias is "8859_1". - // */ - // protected static final String DEFAULT_CHARSET = "ISO-8859-1"; - // - // /** - // * Trigger for charset detection. - // */ - // protected static final String CHARSET_STRING = "charset"; - // - // - // /** - // * Try and extract the character set from the HTTP header. - // * @param connection The connection with the charset info. - // * @return The character set name to use for this HTML page. - // */ - // protected String getCharacterSet (URLConnection connection) - // { - // final String field = "Content-Type"; - // - // String string; - // String ret; - // - // ret = DEFAULT_CHARSET; - // string = connection.getHeaderField (field); - // if (null != string) - // ret = getCharset (string); - // - // return (ret); - // } - // - // /** - // * Get a CharacterSet name corresponding to a charset parameter. 
- // * @param content A text line of the form: - // * <pre> - // * text/html; charset=Shift_JIS - // * </pre> - // * which is applicable both to the HTTP header field Content-Type and - // * the meta tag http-equiv="Content-Type". - // * Note this method also handles non-compliant quoted charset directives such as: - // * <pre> - // * text/html; charset="UTF-8" - // * </pre> - // * and - // * <pre> - // * text/html; charset='UTF-8' - // * </pre> - // * @return The character set name to use when reading the input stream. - // * For JDKs that have the Charset class this is qualified by passing - // * the name to findCharset() to render it into canonical form. - // * If the charset parameter is not found in the given string, the default - // * character set is returned. - // * @see ParserHelper#findCharset - // * @see #DEFAULT_CHARSET - // */ - // protected String getCharset(String content) - // { - // int index; - // String ret; - // - // ret = DEFAULT_CHARSET; - // if (null != content) - // { - // index = content.indexOf(CHARSET_STRING); - // - // if (index != -1) - // { - // content = content.substring(index + CHARSET_STRING.length()).trim(); - // if (content.startsWith("=")) - // { - // content = content.substring(1).trim(); - // index = content.indexOf(";"); - // if (index != -1) - // content = content.substring(0, index); - // - // //remove any double quotes from around charset string - // if (content.startsWith ("\"") && content.endsWith ("\"") && (1 < content.length ())) - // content = content.substring (1, content.length () - 1); - // - // //remove any single quote from around charset string - // if (content.startsWith ("'") && content.endsWith ("'") && (1 < content.length ())) - // content = content.substring (1, content.length () - 1); - // - // ret = ParserHelper.findCharset(content, ret); - // // Charset names are not case-sensitive; - // // that is, case is always ignored when comparing charset names. 
- // if (!ret.equalsIgnoreCase(content)) - // { - // feedback.info ( - // "detected charset \"" - // + content - // + "\", using \"" - // + ret - // + "\""); - // } - // } - // } - // } - // - // return (ret); - // } - // - --- 997,998 ---- |