From: <bi...@us...> - 2010-07-16 20:26:09
|
Revision: 3171 http://archive-access.svn.sourceforge.net/archive-access/?rev=3171&view=rev Author: binzino Date: 2010-07-16 20:26:03 +0000 (Fri, 16 Jul 2010) Log Message: ----------- Local edits of Nutch file to over-ride chatty log messages. Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ParseUtil.java Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ParseUtil.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ParseUtil.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/parse/ParseUtil.java 2010-07-16 20:26:03 UTC (rev 3171) @@ -0,0 +1,139 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/
package org.apache.nutch.parse;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// Nutch Imports
import org.apache.nutch.protocol.Content;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;


/**
 * A utility class containing methods to perform parsing, such as iterating
 * through a preferred list of {@link Parser}s to obtain {@link Parse}
 * objects.
 *
 * <p>In this copy of the class, every diagnostic message (no parser found,
 * parse unsuccessful) is logged at <code>DEBUG</code> level.</p>
 *
 * @author mattmann
 * @author Jérôme Charron
 * @author Sébastien Le Callonnec
 */
public class ParseUtil {

  /* our log stream */
  public static final Log LOG = LogFactory.getLog(ParseUtil.class);

  /* source of the Parser instances used by both parse methods */
  private final ParserFactory parserFactory;

  /**
   * Creates a ParseUtil backed by a new {@link ParserFactory}.
   *
   * @param conf The configuration handed to the {@link ParserFactory}
   *             constructor.
   */
  public ParseUtil(Configuration conf) {
    this.parserFactory = new ParserFactory(conf);
  }

  /**
   * Performs a parse by iterating through the {@link Parser}s that the
   * {@link ParserFactory} returns for the content's type and URL, in the
   * order returned, until one of them yields a non-null, non-empty
   * {@link ParseResult}. If none does, a message is logged at
   * <code>DEBUG</code> level and <code>null</code> is returned.
   *
   * @param content The content to try and parse.
   * @return &lt;key, {@link Parse}&gt; pairs from the first parser that
   *         produced a non-empty result, or <code>null</code> if no parser
   *         did.
   * @throws ParseException If no suitable parser is found to perform the
   *         parse; the underlying {@link ParserNotFound} is attached as the
   *         cause.
   */
  public ParseResult parse(Content content) throws ParseException {
    Parser[] parsers = null;

    try {
      // The factory is given an empty string rather than a null URL.
      parsers = this.parserFactory.getParsers(content.getContentType(),
                  content.getUrl() != null ? content.getUrl() : "");
    } catch (ParserNotFound e) {
      throw parserNotFound(e, content);
    }

    for (int i = 0; i < parsers.length; i++) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
      }
      ParseResult parseResult = parsers[i].getParse(content);
      if (parseResult != null && !parseResult.isEmpty()) {
        return parseResult;
      }
    }

    logUnparsed(content);
    return null;
  }

  /**
   * Parses a {@link Content} object using the {@link Parser} specified by the
   * parameter <code>extId</code>, i.e., the Parser's extension ID.
   * If a suitable {@link Parser} is not found, a <code>DEBUG</code> level
   * message is logged and a ParseException is thrown. If the parser returns
   * a null or empty result, a <code>DEBUG</code> level message is logged and
   * <code>null</code> is returned.
   *
   * @param extId The extension implementation ID of the {@link Parser} to use
   *              to parse the specified content.
   * @param content The content to parse.
   *
   * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful,
   *         otherwise <code>null</code>.
   *
   * @throws ParseException If there is no suitable {@link Parser} found
   *                        to perform the parse; the underlying
   *                        {@link ParserNotFound} is attached as the cause.
   */
  public ParseResult parseByExtensionId(String extId, Content content)
  throws ParseException {
    Parser p = null;

    try {
      p = this.parserFactory.getParserById(extId);
    } catch (ParserNotFound e) {
      throw parserNotFound(e, content);
    }

    ParseResult parseResult = p.getParse(content);
    if (parseResult != null && !parseResult.isEmpty()) {
      return parseResult;
    }

    logUnparsed(content);
    return null;
  }

  /**
   * Logs the "no suitable parser" message at DEBUG level and builds the
   * ParseException to throw, keeping the original exception as its cause so
   * the stack trace of the lookup failure is not lost.
   */
  private static ParseException parserNotFound(ParserNotFound e, Content content) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("No suitable parser found when trying to parse content " + describe(content));
    }
    return new ParseException(e.getMessage(), e);
  }

  /** Logs, at DEBUG level, that no parser produced a usable result. */
  private static void logUnparsed(Content content) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Unable to successfully parse content " + describe(content));
    }
  }

  /** Builds the "url of type contentType" tail shared by the log messages. */
  private static String describe(Content content) {
    return content.getUrl() + " of type " + content.getContentType();
  }

}
// This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.