You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2009-11-05 23:06:45
|
Revision: 2880 http://archive-access.svn.sourceforge.net/archive-access/?rev=2880&view=rev Author: bradtofel Date: 2009-11-05 23:06:38 +0000 (Thu, 05 Nov 2009) Log Message: ----------- INITIAL REV: htmllex code for streaming modification of HTML documents Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ContextResultURIConverterFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegatorVisitor.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/StringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AfterBodyStartTagJSPExecRule.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AttributeModifyingRule.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/BeforeBodyEndTagJSPExecRule.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/CommentRule.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSContentRule.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSPExecRule.java 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/RawNodeRule.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StaticStringRule.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StyleContentRule.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseCSSStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseHrefStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BlockCSSStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/IdentityStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/InlineCSSStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/URLStringTransformer.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ContextResultURIConverterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ContextResultURIConverterFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ContextResultURIConverterFactory.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,40 @@ 
+/* ContextResultURIConverterFactory + * + * $Id$: + * + * Created on Nov 5, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.replay.html; + +import org.archive.wayback.ResultURIConverter; + +/** + * + * Abstracts creation of specialized ResultURIConverters based on particular + * flags. 
+ * + * @author brad + * + */ +public interface ContextResultURIConverterFactory { + public ResultURIConverter getContextConverter(String flags); +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ContextResultURIConverterFactory.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,146 @@ +/* ReplayParseContext + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html; + +import java.io.OutputStream; +import java.net.URL; +import java.util.HashMap; +import java.util.Map; + +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.replay.JSPExecutor; +import org.archive.wayback.util.htmllex.ParseContext; + +public class ReplayParseContext extends ParseContext { + private ContextResultURIConverterFactory uriConverterFactory = null; + private String datespec = null; + private JSPExecutor jspExec = null; + private OutputStream outputStream = null; + private Map<String,ResultURIConverter> converters = null; + private String outputCharset; + private int phase = -1; + + public ReplayParseContext(ContextResultURIConverterFactory uriConverterFactory, + URL baseUrl, String datespec) { + + this.uriConverterFactory = uriConverterFactory; + this.baseUrl = baseUrl; + this.datespec = datespec; + converters = new HashMap<String,ResultURIConverter>(); + } + + public void setPhase(int phase) { + this.phase = phase; + } + public int getPhase() { + return phase; + } + + /** + * @return the converters + */ + public Map<String, ResultURIConverter> getConverters() { + return converters; + } + + /** + * @param converters the converters to set + */ + public void setConverters(Map<String, ResultURIConverter> converters) { + this.converters = converters; + } + public void addConverter(String flag, ResultURIConverter converter) { + converters.put(flag, converter); + } + + + private ResultURIConverter makeConverter(String flags) { + return uriConverterFactory.getContextConverter(flags); + } + public ResultURIConverter getConverter(String flags) { + ResultURIConverter converter = converters.get(flags); + if(converter == null) { + converter = makeConverter(flags); + 
converters.put(flags,converter); + } + return converter; + } + + public String contextualizeUrl(String url) { + return contextualizeUrl(url,""); + } + public String contextualizeUrl(String url, String flags) { + if(url.startsWith("javascript:")) { + return url; + } + url = super.contextualizeUrl(url); + if(flags == null) { + flags = ""; + } + ResultURIConverter converter = getConverter(flags); + return converter.makeReplayURI(datespec, url); + } + + + /** + * @return the charset + */ + public String getOutputCharset() { + return outputCharset; + } + + /** + * @param outputCharset the outputCharset to set + */ + public void setOutputCharset(String outputCharset) { + this.outputCharset = outputCharset; + } + + /** + * @return the outputStream + */ + public OutputStream getOutputStream() { + return outputStream; + } + + /** + * @param outputStream the outputStream to set + */ + public void setOutputStream(OutputStream outputStream) { + this.outputStream = outputStream; + } + /** + * @return the jspExec + */ + public JSPExecutor getJspExec() { + return jspExec; + } + /** + * @param jspExec the jspExec to set + */ + public void setJspExec(JSPExecutor jspExec) { + this.jspExec = jspExec; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,151 @@ +/* 
ReplayParseEventDelegator + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; +import java.util.List; + +import org.archive.wayback.util.htmllex.ParseEventHandler; +import org.archive.wayback.util.htmllex.ParseEventDelegator; +import org.archive.wayback.util.htmllex.ParseContext; +import org.htmlparser.Node; + +public class ReplayParseEventDelegator implements ParseEventHandler { + + public static final int PHASE_PRE_MODIFY = 0; + public static final int PHASE_MODIFY = 1; + public static final int PHASE_POST_OUTPUT = 2; + + private ParseEventDelegator preModifyDelegator = null; + private ParseEventDelegator modifyDelegator = null; + private ParseEventDelegator postModifyDelegator = null; + private List<ReplayParseEventDelegatorVisitor> parserVisitors = null; + + protected void emit(ParseContext context, Node node) throws IOException { + ReplayParseContext rContext = (ReplayParseContext) context; + OutputStream out = rContext.getOutputStream(); + // no-op, override to actually output something: + if(out != null) { + String charset = 
rContext.getOutputCharset(); + String rawHTML = node.toHtml(true); + byte[] bytes = null; + try { + bytes = rawHTML.getBytes(charset); + } catch (UnsupportedEncodingException e) { + bytes = rawHTML.getBytes(); + } + out.write(bytes); + } + } + + + public void init() { + preModifyDelegator = new ParseEventDelegator(); + modifyDelegator = new ParseEventDelegator(); + postModifyDelegator = new ParseEventDelegator(); + if(parserVisitors != null) { + for(ReplayParseEventDelegatorVisitor visitor : parserVisitors) { + visitor.visit(this); + } + } + } + + + public void handleNode(ParseContext pContext, Node node) + throws IOException { + ReplayParseContext context = (ReplayParseContext) pContext; + context.setPhase(PHASE_PRE_MODIFY); + preModifyDelegator.handleNode(context,node); + context.setPhase(PHASE_MODIFY); + modifyDelegator.handleNode(context,node); + emit(context, node); + context.setPhase(PHASE_POST_OUTPUT); + postModifyDelegator.handleNode(context,node); + + + } + + public void handleParseComplete(ParseContext context) throws IOException { + preModifyDelegator.handleParseComplete(context); + modifyDelegator.handleParseComplete(context); + postModifyDelegator.handleParseComplete(context); + } + /** + * @return the preModifyDelegator + */ + public ParseEventDelegator getPreModifyDelegator() { + return preModifyDelegator; + } + + + /** + * @param preModifyDelegator the preModifyDelegator to set + */ + public void setPreModifyDelegator(ParseEventDelegator preModifyDelegator) { + this.preModifyDelegator = preModifyDelegator; + } + + + /** + * @return the modifyDelegator + */ + public ParseEventDelegator getModifyDelegator() { + return modifyDelegator; + } + + + /** + * @param modifyDelegator the modifyDelegator to set + */ + public void setModifyDelegator(ParseEventDelegator modifyDelegator) { + this.modifyDelegator = modifyDelegator; + } + + + /** + * @return the postModifyDelegator + */ + public ParseEventDelegator getPostModifyDelegator() { + return 
postModifyDelegator; + } + + + /** + * @param postModifyDelegator the postModifyDelegator to set + */ + public void setPostModifyDelegator(ParseEventDelegator postModifyDelegator) { + this.postModifyDelegator = postModifyDelegator; + } + + + /** + * @param parserVisitors the parserVisitors to set + */ + public void setParserVisitors(List<ReplayParseEventDelegatorVisitor> parserVisitors) { + this.parserVisitors = parserVisitors; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegatorVisitor.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegatorVisitor.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegatorVisitor.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,29 @@ +/* ReplayParseEventDelegatorVisitor + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html; + +public interface ReplayParseEventDelegatorVisitor { + public void visit(ReplayParseEventDelegator rules); +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseEventDelegatorVisitor.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/StringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/StringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/StringTransformer.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,29 @@ +/* StringTransformer + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html; + +public interface StringTransformer { + public String transform(ReplayParseContext context, String input); +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/StringTransformer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AfterBodyStartTagJSPExecRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AfterBodyStartTagJSPExecRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AfterBodyStartTagJSPExecRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,114 @@ +/* AfterBodyStartTagJSPExecRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; + +import javax.servlet.ServletException; + +import org.archive.wayback.replay.html.ReplayParseEventDelegator; +import org.archive.wayback.replay.html.ReplayParseEventDelegatorVisitor; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.OpenTagHandler; +import org.htmlparser.Node; +import org.htmlparser.nodes.TagNode; + +/** + * This Rule fires just after the BODY start tag, emitting the result of the + * replay .jsp into the resulting page at that point. + * + * Sounds simple, BUT, it's possible there is no BODY start tag... + * + * In case this happens, we watch *ALL* tags go by, before they've been output, + * and if we see any start tags not of the following types: + * + * html,head,base,link,meta,title,style,script + * + * we emit our content then and there. + * + * We also ensure we don't emit twice by storing a flag in the ParseContext once + * we do emit. 
+ * + * @author brad + * + */ +public class AfterBodyStartTagJSPExecRule extends JSPExecRule +implements ReplayParseEventDelegatorVisitor, OpenTagHandler { + private final String[] okHeadTags = { + "HTML","HEAD","BASE","LINK","META","TITLE","STYLE","SCRIPT","BODY" + }; + private final static String FERRET_DONE_KEY = + AfterBodyStartTagJSPExecRule.class.toString(); + public void visit(ReplayParseEventDelegator rules) { + + rules.getPostModifyDelegator().addOpenTagHandler(this,"BODY"); + rules.getPreModifyDelegator().addOpenTagHandler(this); + } + + public void emit(ReplayParseContext context, Node node) throws IOException { + String found = context.getData(FERRET_DONE_KEY); + if(found == null) { + context.putData(FERRET_DONE_KEY,"1"); + try { + super.emit(context, node); + } catch (ServletException e) { + throw new IOException(e); + } + } + } + + private boolean isNotTagAppearingInHead(TagNode node) { + String thisTag = node.getTagName(); + if(thisTag.startsWith("!")) return false; + for(String tag : okHeadTags) { + if(thisTag.equals(tag)) { + return false; + } + } + return true; + } + + public void handleOpenTagNode(ParseContext pContext, TagNode node) + throws IOException { + ReplayParseContext context = (ReplayParseContext) pContext; + if(context.getData(FERRET_DONE_KEY) == null) { + // we haven't emitted yet: + // are we running in post-emit? + if(context.getPhase() == ReplayParseEventDelegator.PHASE_POST_OUTPUT) { + // emit if it is a body tag: + if(node.getTagName().equals("BODY")) { + emit((ReplayParseContext) context,node); + } + } else { + // must be PHASE_PRE_MODIFY: if it's a body tag, emit now: + if(isNotTagAppearingInHead(node)) { + // and this is a tag that shouldn't be in the HEAD. 
Emit: + emit((ReplayParseContext) context,node); + } + } + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AfterBodyStartTagJSPExecRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AttributeModifyingRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AttributeModifyingRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AttributeModifyingRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,154 @@ +/* AttributeModifyingRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; + +import org.archive.wayback.replay.html.ReplayParseEventDelegator; +import org.archive.wayback.replay.html.ReplayParseEventDelegatorVisitor; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.OpenTagHandler; +import org.htmlparser.nodes.TagNode; + +public class AttributeModifyingRule implements ReplayParseEventDelegatorVisitor, + OpenTagHandler { + + private String tagName = null; + private String whereAttributeName = null; + private String whereAttributeValue = null; + private String modifyAttributeName = null; + private StringTransformer transformer; + + public void visit(ReplayParseEventDelegator rules) { + if(modifyAttributeName == null) { + throw new RuntimeException("Need modifyAttributeName"); + } + if(tagName == null) { + rules.getModifyDelegator().addOpenTagHandler(this); + } else { + rules.getModifyDelegator().addOpenTagHandler(this, tagName); + } + } + + public void handleOpenTagNode(ParseContext context, TagNode node) + throws IOException { + if(whereAttributeName != null) { + // if matchAttrName is set, make sure it is present: + String nodeAttrVal = node.getAttribute(whereAttributeName); + if(nodeAttrVal == null) { + return; + } + // if the value is specified, too, make sure that matches, as well: + if(whereAttributeValue != null) { + if(!nodeAttrVal.equals(whereAttributeValue)) { + return; + } + } + } + // try to perform the update: + if(modifyAttributeName == null) { + // mis-configuration... 
this is required: + // TODO: log a warning + return; + } + String nodeVal = node.getAttribute(modifyAttributeName); + if(nodeVal != null) { + String newVal = transformer.transform((ReplayParseContext)context, nodeVal); + node.setAttribute(modifyAttributeName, newVal); + } + } + + /** + * @return the tagName + */ + public String getTagName() { + return tagName; + } + + /** + * @param tagName the tagName to set + */ + public void setTagName(String tagName) { + this.tagName = tagName.toUpperCase(); + } + + /** + * @return the whereAttributeName + */ + public String getWhereAttributeName() { + return whereAttributeName; + } + + /** + * @param whereAttributeName the whereAttributeName to set + */ + public void setWhereAttributeName(String whereAttributeName) { + this.whereAttributeName = whereAttributeName.toUpperCase(); + } + + /** + * @return the whereAttributeValue + */ + public String getWhereAttributeValue() { + return whereAttributeValue; + } + + /** + * @param whereAttributeValue the whereAttributeValue to set + */ + public void setWhereAttributeValue(String whereAttributeValue) { + this.whereAttributeValue = whereAttributeValue; + } + + /** + * @return the modifyAttributeName + */ + public String getModifyAttributeName() { + return modifyAttributeName; + } + + /** + * @param modifyAttribute the modifyAttribute to set + */ + public void setModifyAttributeName(String modifyAttributeName) { + this.modifyAttributeName = modifyAttributeName.toUpperCase(); + } + + /** + * @return the transformer + */ + public StringTransformer getTransformer() { + return transformer; + } + + /** + * @param transformer the transformer to set + */ + public void setTransformer(StringTransformer transformer) { + this.transformer = transformer; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/AttributeModifyingRule.java ___________________________________________________________________ Added: svn:keywords + 
Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/BeforeBodyEndTagJSPExecRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/BeforeBodyEndTagJSPExecRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/BeforeBodyEndTagJSPExecRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,75 @@ +/* BeforeBodyEndTagJSPExecRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; + +import javax.servlet.ServletException; + +import org.archive.wayback.replay.html.ReplayParseEventDelegator; +import org.archive.wayback.replay.html.ReplayParseEventDelegatorVisitor; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.CloseTagHandler; +import org.archive.wayback.util.htmllex.handlers.ParseCompleteHandler; +import org.htmlparser.Node; +import org.htmlparser.nodes.TagNode; + +public class BeforeBodyEndTagJSPExecRule extends JSPExecRule +implements ReplayParseEventDelegatorVisitor, CloseTagHandler, ParseCompleteHandler { + private final static String FERRET_DONE_KEY = + BeforeBodyEndTagJSPExecRule.class.toString(); + + public void visit(ReplayParseEventDelegator rules) { + rules.getPreModifyDelegator().addCloseTagHandler(this); + rules.getPreModifyDelegator().addParseCompleteHandler(this); + } + + public void emit(ReplayParseContext context, Node node) throws IOException { + String found = context.getData(FERRET_DONE_KEY); + if(found == null) { + context.putData(FERRET_DONE_KEY,"1"); + try { + super.emit(context, node); + } catch (ServletException e) { + throw new IOException(e); + } + } + } + + + public void handleCloseTagNode(ParseContext context, TagNode node) + throws IOException { + String tagName = node.getTagName(); + if(tagName.equals("BODY") || tagName.equals("HTML")) { + emit((ReplayParseContext) context,node); + } + } + + public void handleParseComplete(ParseContext context) throws IOException { + emit((ReplayParseContext) context,null); + } + +} Property changes on: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/BeforeBodyEndTagJSPExecRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/CommentRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/CommentRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/CommentRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,67 @@ +/* CommentRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; +import java.io.OutputStream; + +import org.archive.wayback.replay.html.ReplayParseEventDelegator; +import org.archive.wayback.replay.html.ReplayParseEventDelegatorVisitor; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.CloseTagHandler; +import org.archive.wayback.util.htmllex.handlers.OpenTagHandler; +import org.htmlparser.Node; +import org.htmlparser.nodes.TagNode; + +public class CommentRule implements ReplayParseEventDelegatorVisitor, + OpenTagHandler, CloseTagHandler { + + private final static byte[] startComment = "<!--".getBytes(); + private final static byte[] endComment = "-->".getBytes(); + + public void emit(ReplayParseContext context, Node node) throws IOException { + OutputStream os = context.getOutputStream(); + if(os != null) { + os.write(startComment); + os.write(node.toHtml(true).getBytes()); + os.write(endComment); + } + } + + public void visit(ReplayParseEventDelegator rules) { + rules.getPreModifyDelegator().addOpenTagHandler(this); + rules.getPreModifyDelegator().addCloseTagHandler(this, "A"); + } + + public void handleOpenTagNode(ParseContext context, TagNode node) throws IOException { + emit((ReplayParseContext)context,node); + } + + public void handleCloseTagNode(ParseContext context, TagNode node) + throws IOException { + emit((ReplayParseContext)context,node); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/CommentRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSContentRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSContentRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSContentRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,62 @@ +/* JSContentRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; + +import org.archive.wayback.replay.html.ReplayParseEventDelegator; +import org.archive.wayback.replay.html.ReplayParseEventDelegatorVisitor; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.JSTextHandler; +import org.htmlparser.nodes.TextNode; + +public class JSContentRule implements ReplayParseEventDelegatorVisitor, JSTextHandler { + private StringTransformer transformer; + + public void visit(ReplayParseEventDelegator rules) { + rules.getModifyDelegator().addJSTextHandler(this); + } + + public void handleJSTextNode(ParseContext context, TextNode node) + throws IOException { + node.setText(transformer.transform((ReplayParseContext)context, node.getText())); + } + + /** + * @return the transformer + */ + public StringTransformer getTransformer() { + return transformer; + } + + /** + * @param transformer the transformer to set + */ + public void setTransformer(StringTransformer transformer) { + this.transformer = transformer; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSContentRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSPExecRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSPExecRule.java (rev 0) 
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSPExecRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,72 @@ +/* JSPExecRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; + +import javax.servlet.ServletException; + +import org.archive.wayback.replay.JSPExecutor; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.htmlparser.Node; + +public class JSPExecRule { + private String jspPath = null; + + public void emit(ReplayParseContext context, Node node) throws ServletException, IOException { + JSPExecutor jspExec = context.getJspExec(); + if(jspExec != null) { + OutputStream os = context.getOutputStream(); + if(os != null) { + String jspResult = jspExec.jspToString(jspPath); + byte[] bytes = null; + try { + bytes = jspResult.getBytes(context.getOutputCharset()); + } catch(UnsupportedEncodingException e) { + e.printStackTrace(); + bytes = jspResult.getBytes(); + } + os.write(bytes); + } + } + } + + /** + * @return the jspPath + */ + public String 
getJspPath() { + return jspPath; + } + + /** + * @param jspPath the jspPath to set + */ + public void setJspPath(String jspPath) { + this.jspPath = jspPath; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/JSPExecRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/RawNodeRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/RawNodeRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/RawNodeRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,53 @@ +/* RawNodeRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; + +import javax.servlet.ServletException; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.htmlparser.Node; + +public class RawNodeRule { + + public void emit(ReplayParseContext context, Node node) throws ServletException, + IOException { + OutputStream os = context.getOutputStream(); + if(os != null) { + String charset = context.getOutputCharset(); + String rawHTML = node.toHtml(true); + try { + os.write(rawHTML.getBytes(charset)); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + os.write(rawHTML.getBytes()); + } + } + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/RawNodeRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StaticStringRule.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StaticStringRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StaticStringRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,56 @@ +/* StaticStringRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; +import java.io.OutputStream; + +import javax.servlet.ServletException; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.htmlparser.Node; + +public class StaticStringRule { + public String text; + public void emit(ReplayParseContext context, Node node) throws ServletException, + IOException { + OutputStream os = context.getOutputStream(); + if(os != null) { + os.write(text.getBytes(context.getOutputCharset())); + } + } + /** + * @return the text + */ + public String getText() { + return text; + } + /** + * @param text the text to set + */ + public void setText(String text) { + this.text = text; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StaticStringRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StyleContentRule.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StyleContentRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StyleContentRule.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,60 @@ +/* StyleContentRule + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.rules; + +import java.io.IOException; + +import org.archive.wayback.replay.html.ReplayParseEventDelegator; +import org.archive.wayback.replay.html.ReplayParseEventDelegatorVisitor; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.CSSTextHandler; +import org.htmlparser.nodes.TextNode; + +public class StyleContentRule implements ReplayParseEventDelegatorVisitor, CSSTextHandler { + private StringTransformer transformer; + + public void visit(ReplayParseEventDelegator rules) { + rules.getModifyDelegator().addCSSTextHandler(this); + } + public void handleCSSTextNode(ParseContext context, TextNode node) + 
throws IOException { + node.setText(transformer.transform((ReplayParseContext)context, node.getText())); + } + /** + * @return the transformer + */ + public StringTransformer getTransformer() { + return transformer; + } + + /** + * @param transformer the transformer to set + */ + public void setTransformer(StringTransformer transformer) { + this.transformer = transformer; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/rules/StyleContentRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseCSSStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseCSSStringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseCSSStringTransformer.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,84 @@ +/* URLStringTransformer + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.transformer; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.archive.wayback.replay.html.ReplayParseContext; + +public abstract class BaseCSSStringTransformer { + // this looks for "url(ZZZ)" + protected static String cssUrlPatString = + "url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)"; +// protected static String cssUrlPatString = +// "url\\s*\\(\\s*([^\\)]*)\\s*\\)"; + + // this looks for various forms of "@import ZZZ" where "ZZZ" may or may not + // have quotes and parenths around it.. + // this regex is not supposed to match the (correct) @import url(ZZZ) form, + // which is handled by the more generic "url(ZZZ)" pattern + protected static String cssImportNoUrlPatString = + "@import\\s+(('[^']+')|(\"[^\"]+\")|(\\('[^']+'\\))|(\\(\"[^\"]+\"\\))|(\\([^)]+\\))|([a-z0-9_.:/\\\\-]+))\\s*;"; + + protected static Pattern cssImportNoUrlPattern = Pattern + .compile(cssImportNoUrlPatString); + + protected static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString); + + protected void patternRewrite(ReplayParseContext context, StringBuilder sb, + Pattern pattern, String flags) { + int idx = 0; + Matcher urlMatcher = pattern.matcher(sb); + while (urlMatcher.find(idx)) { + String url = urlMatcher.group(1); + int origUrlLength = url.length(); + int urlStart = urlMatcher.start(1); + int urlEnd = urlMatcher.end(1); + idx = urlEnd; + if ((url.charAt(0) == '(') + && (url.charAt(origUrlLength-1) == ')')) { + url = url.substring(1, origUrlLength - 1); + urlStart += 1; + origUrlLength -= 2; + } + if (url.charAt(0) == '"') { + url = url.substring(1, origUrlLength - 1); + urlStart += 1; + } else if (url.charAt(0) == '\'') { + url = url.substring(1, origUrlLength - 1); + urlStart += 1; + } else if 
(url.charAt(0) == '\\') { + url = url.substring(2, origUrlLength - 2); + urlStart += 2; + } + int urlLength = url.length(); + String replayUrl = context.contextualizeUrl(url, flags); + int delta = replayUrl.length() - urlLength; + sb.replace(urlStart, urlStart + urlLength, replayUrl); + idx += delta; + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseCSSStringTransformer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseHrefStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseHrefStringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseHrefStringTransformer.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,43 @@ +/* URLStringTransformer + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.transformer; + +import java.net.MalformedURLException; +import java.net.URL; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; + +public class BaseHrefStringTransformer implements StringTransformer { + + public String transform(ReplayParseContext context, String input) { + try { + context.setBaseUrl(new URL(input)); + } catch (MalformedURLException e) { + e.printStackTrace(); + } + return input; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BaseHrefStringTransformer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BlockCSSStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BlockCSSStringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BlockCSSStringTransformer.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,41 @@ +/* URLStringTransformer + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.transformer; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; + +public class BlockCSSStringTransformer extends BaseCSSStringTransformer implements StringTransformer { + + public String transform(ReplayParseContext context, String css) { + StringBuilder sb = new StringBuilder(css); + patternRewrite((ReplayParseContext)context, sb,cssUrlPattern, null); + patternRewrite((ReplayParseContext)context, sb,cssImportNoUrlPattern, + "cs_"); +// return "__BCSS__" + sb.toString() + "__BCSS__"; + return sb.toString(); + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/BlockCSSStringTransformer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/IdentityStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/IdentityStringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/IdentityStringTransformer.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,35 @@ +/* URLStringTransformer + * + * $Id$ + * + * Created on 12:36:59 
PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay.html.transformer; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; + +public class IdentityStringTransformer implements StringTransformer { + + public String transform(ReplayParseContext context, String input) { + return input; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/IdentityStringTransformer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/InlineCSSStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/InlineCSSStringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/InlineCSSStringTransformer.java 2009-11-05 23:06:38 UTC (rev 2880) @@ -0,0 +1,38 @@ +/* 
URLStringTransformer + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of t... [truncated message content] |
From: <bra...@us...> - 2009-11-05 23:02:01
|
Revision: 2879 http://archive-access.svn.sourceforge.net/archive-access/?rev=2879&view=rev Author: bradtofel Date: 2009-11-05 23:01:46 +0000 (Thu, 05 Nov 2009) Log Message: ----------- INITIAL REV: Library which sits on top of htmlparser, enabling SAX stream handling for both indexing, text & link extraction, and streaming modifications to HTML documents. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,84 @@ +/* ContextAwareLexer + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex; + +import org.htmlparser.Node; +import org.htmlparser.lexer.Lexer; +import org.htmlparser.util.ParserException; + +/** + * + * The Lexer that comes with htmlparser does not handle non-escaped HTML + * entities within SCRIPT tags - by default, something like: + * + * <script> + * for(var i=0; i<23; i++) { j+=i; } + * </script> + * + * Can cause the lexer to skip over a large part of the document. Technically, + * the above isn't legit HTML, but of course, folks do stuff like that all the + * time. So, this class uses a ParseContext object, passed in at construction, + * which observes the SCRIPT and STYLE tags, both setting properties on the + * ParseContext, and using that state information to perform a parseCDATA() + * call instead of a nextNode() call at the right time, to try to keep the + * SAX parsing in sync with the document. 
+ * + * @author brad + * + */ +public class ContextAwareLexer extends NodeUtils { + + private Lexer lexer = null; + private ParseContext context = null; + public ContextAwareLexer(Lexer lexer, ParseContext context) { + this.lexer = lexer; + this.context = context; + } + public Node nextNode() throws ParserException { + Node node = null; + if(context.isInJS()) { + node = lexer.parseCDATA(true); + if(node != null) { + context.setInScriptText(true); + context.setInJS(false); + return node; + } + } + context.setInScriptText(false); + node = lexer.nextNode(context.isInJS()); + if(node != null) { + if(isNonEmptyOpenTagNodeNamed(node, SCRIPT_TAG_NAME)) { + context.setInJS(true); + } else if(isCloseTagNodeNamed(node, SCRIPT_TAG_NAME)) { + context.setInJS(false); + } else if(isNonEmptyOpenTagNodeNamed(node, STYLE_TAG_NAME)) { + context.setInCSS(true); + } else if(isCloseTagNodeNamed(node, STYLE_TAG_NAME)) { + context.setInCSS(false); + } + } + return node; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,83 @@ +/* NodeUtils + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex; + +import org.htmlparser.Node; +import org.htmlparser.nodes.RemarkNode; +import org.htmlparser.nodes.TagNode; +import org.htmlparser.nodes.TextNode; + +public class NodeUtils { + public static final String SCRIPT_TAG_NAME = "SCRIPT"; + public static final String STYLE_TAG_NAME = "STYLE"; + + public static boolean isTagNode(Node node) { + return (node instanceof TagNode); + } + public static boolean isTextNode(Node node) { + return (node instanceof TextNode); + } + public static boolean isRemarkNode(Node node) { + return (node instanceof RemarkNode); + } + public static boolean isTagNodeNamed(Node node, String name) { + if(isTagNode(node)) { + TagNode tagNode = (TagNode) node; + String nodeName = tagNode.getTagName(); + return nodeName.equals(name); + } + return false; + } + public static boolean isOpenTagNodeNamed(Node node, String name) { + if(isTagNode(node)) { + TagNode tagNode = (TagNode) node; + if(!tagNode.isEndTag()) { + String nodeName = tagNode.getTagName(); + return nodeName.equals(name); + } + } + return false; + } + public static boolean isNonEmptyOpenTagNodeNamed(Node node, String name) { + if(isTagNode(node)) { + TagNode tagNode = (TagNode) node; + if(!tagNode.isEndTag() && !tagNode.isEmptyXmlTag()) { + String nodeName = 
tagNode.getTagName(); + return nodeName.equals(name); + } + } + return false; + } + public static boolean isCloseTagNodeNamed(Node node, String name) { + if(isTagNode(node)) { + TagNode tagNode = (TagNode) node; + if(tagNode.isEndTag()) { + String nodeName = tagNode.getTagName(); + return nodeName.equals(name); + } + } + return false; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,118 @@ +/* ParseContext + * + * $Id$ + * + * Created on 2:06:46 PM Feb 19, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of test. + * + * test is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * test is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
/**
 * Tracks the state involved with parsing an HTML document: whether the parser
 * is currently inside CSS, inside a SCRIPT element, or looking at script
 * text.
 *
 * Also holds the base URL of the page and resolves URLs found in the page
 * against it, and exposes a general purpose String-to-String map for use by
 * specific applications.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class ParseContext {
    // Fully qualified so that no new import is required by this class.
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(ParseContext.class.getName());

    private static final String JAVASCRIPT_SCHEME = "javascript:";

    protected URL baseUrl = null;

    private boolean inCSS = false;
    private boolean inJS = false;
    private boolean inScriptText = false;
    private final HashMap<String,String> data = new HashMap<String,String>();

    public ParseContext() {
    }

    /**
     * @param key   application-defined key
     * @param value value to associate with key, replacing any previous value
     */
    public void putData(String key, String value) {
        data.put(key, value);
    }

    /**
     * @param key application-defined key
     * @return the value stored under key, or null if there is none
     */
    public String getData(String key) {
        return data.get(key);
    }

    /**
     * @param url the URL that relative URLs are resolved against
     */
    public void setBaseUrl(URL url) {
        baseUrl = url;
    }

    /**
     * Resolves url against the base URL.
     *
     * @param url absolute or relative URL
     * @return the resolved URL in String form
     * @throws MalformedURLException if java.net.URL cannot build a URL from
     *         the base URL and the argument
     */
    public String resolve(String url) throws MalformedURLException {
        URL tmp = new URL(baseUrl,url);
        return tmp.toString();
    }

    /**
     * Best-effort variant of {@link #resolve(String)} which never throws.
     *
     * @param url absolute or relative URL, may be null
     * @return null if url is null; url unchanged if it is a "javascript:" URL
     *         (in any letter case) or cannot be resolved; otherwise the URL
     *         resolved against the base URL
     */
    public String contextualizeUrl(String url) {
        if (url == null) {
            // Nothing to resolve; avoid a NullPointerException mid-parse.
            return null;
        }
        // URL schemes are case-insensitive, so "JavaScript:" must be left
        // alone as well rather than being pushed through the failure path.
        if (url.regionMatches(true, 0, JAVASCRIPT_SCHEME, 0,
                JAVASCRIPT_SCHEME.length())) {
            return url;
        }
        try {
            return resolve(url);
        } catch (MalformedURLException e) {
            // Unresolvable URLs are expected in real-world pages: record one
            // line and hand the original text back, rather than dumping a
            // stack trace to stderr for each of them.
            LOGGER.warning("Unable to resolve URL (" + url + ") against base ("
                    + baseUrl + "): " + e.getMessage());
            return url;
        }
    }

    /**
     * @return the inCSS
     */
    public boolean isInCSS() {
        return inCSS;
    }

    /**
     * @param inCSS the inCSS to set
     */
    public void setInCSS(boolean inCSS) {
        this.inCSS = inCSS;
    }

    /**
     * @return the inJS
     */
    public boolean isInJS() {
        return inJS;
    }

    /**
     * @param inJS the inJS to set
     */
    public void setInJS(boolean inJS) {
        this.inJS = inJS;
    }

    /**
     * @return the inScriptText
     */
    public boolean isInScriptText() {
        return inScriptText;
    }

    /**
     * @param inScriptText the inScriptText to set
     */
    public void setInScriptText(boolean inScriptText) {
        this.inScriptText = inScriptText;
    }
}
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.archive.wayback.util.htmllex.handlers.CSSTextHandler; +import org.archive.wayback.util.htmllex.handlers.CloseTagHandler; +import org.archive.wayback.util.htmllex.handlers.ContentTextHandler; +import org.archive.wayback.util.htmllex.handlers.JSTextHandler; +import org.archive.wayback.util.htmllex.handlers.OpenTagHandler; +import org.archive.wayback.util.htmllex.handlers.ParseCompleteHandler; +import org.archive.wayback.util.htmllex.handlers.RemarkTextHandler; +import org.htmlparser.Node; +import org.htmlparser.nodes.RemarkNode; +import org.htmlparser.nodes.TagNode; +import org.htmlparser.nodes.TextNode; + +/** + * + * This class provides an abstraction between high-level SAX events, and + * application specific low-level SAX event handlers. + * + * Any object which wishes to receive any low-level SAX events is placed in the + * parserVisitors List, and at initialization of this class, each element in + * that list is given an opportunity to register to receive whatever low-level + * SAX events it is interested in. + * + * This class also manages casting of Node objects into more event-specific + * casts, and uses the ParseContext to route specific nodes to the registered + * handlers of each low-level event types. + * + * This class attempts to be efficient about targeting specific TagNodes: + * When registering to receive events, handlers can register for a specific + * tag name, or for the global-tag ("*") name. + * + * As TagNodes are handled, all tag-specific handlers are called, followed by + * all global-tag handlers. 
+ * + * @author brad + */ +public class ParseEventDelegator implements ParseEventHandler { + + public static final String WILDCARD_TAG_NAME = "*"; + + private Map<String,List<CloseTagHandler>> closeTagHandlers = null; + private Map<String,List<OpenTagHandler>> openTagHandlers = null; + private List<CSSTextHandler> cssTextHandlers = null; + private List<JSTextHandler> jsTextHandler = null; + private List<RemarkTextHandler> remarkTextHandler = null; + private List<ContentTextHandler> contentTextHandler = null; + private List<ParseCompleteHandler> parseCompleteHandlers = null; + + private List<ParseEventDelegatorVisitor> parserVisitors = null; + + + public void init() { + if(parserVisitors != null) { + for(ParseEventDelegatorVisitor visitor : parserVisitors) { + visitor.visit(this); + } + } + } + + public void handleNode(ParseContext context, Node node) + throws IOException { + + if(NodeUtils.isRemarkNode(node)) { + RemarkNode remarkNode = (RemarkNode) node; + handleRemarkTextNode(context,remarkNode); + + } else if(NodeUtils.isTextNode(node)) { + TextNode textNode = (TextNode) node; + if(context.isInCSS()) { + handleCSSTextNode(context,textNode); + + } else if(context.isInScriptText()) { + handleJSTextNode(context,textNode); + } else { + handleContentTextNode(context,textNode); + } + } else if(NodeUtils.isTagNode(node)) { + TagNode tagNode = (TagNode) node; + if(tagNode.isEndTag()) { + handleCloseTagNode(context,tagNode); + } else { + // assume start, possibly empty: + handleOpenTagNode(context,tagNode); + } + } else { + throw new IllegalArgumentException("Unknown node type.."); + } + } + + // CLOSE TAG: + public void addCloseTagHandler(CloseTagHandler v) { + addCloseTagHandler(v, WILDCARD_TAG_NAME); + } + public void addCloseTagHandler(CloseTagHandler v, String name) { + if(closeTagHandlers == null) { + closeTagHandlers = new HashMap<String,List<CloseTagHandler>>(); + } + if(!closeTagHandlers.containsKey(name)) { + closeTagHandlers.put(name, new 
ArrayList<CloseTagHandler>()); + } + closeTagHandlers.get(name).add(v); + } + public void handleCloseTagNode(ParseContext context, TagNode node) throws IOException { + String name = node.getTagName(); + if(closeTagHandlers != null) { + for(String n : new String[]{name,WILDCARD_TAG_NAME}) { + if(closeTagHandlers.containsKey(n)) { + for(CloseTagHandler v : closeTagHandlers.get(n)) { + v.handleCloseTagNode(context,node); + } + } + } + } + } + + // OPEN TAG: + public void addOpenTagHandler(OpenTagHandler v) { + addOpenTagHandler(v, WILDCARD_TAG_NAME); + } + public void addOpenTagHandler(OpenTagHandler v, String name) { + if(openTagHandlers == null) { + openTagHandlers = new HashMap<String,List<OpenTagHandler>>(); + } + if(!openTagHandlers.containsKey(name)) { + openTagHandlers.put(name, new ArrayList<OpenTagHandler>()); + } + openTagHandlers.get(name).add(v); + } + + public void handleOpenTagNode(ParseContext context, TagNode node) throws IOException { + String name = node.getTagName(); + if(openTagHandlers != null) { + for(String n : new String[]{name,WILDCARD_TAG_NAME}) { + if(openTagHandlers.containsKey(n)) { + for(OpenTagHandler v : openTagHandlers.get(n)) { + v.handleOpenTagNode(context,node); + } + } + } + } + } + public void addCSSTextHandler(CSSTextHandler v) { + if(cssTextHandlers == null) { + cssTextHandlers = new ArrayList<CSSTextHandler>(); + } + cssTextHandlers.add(v); + } + public void handleCSSTextNode(ParseContext context, TextNode node) throws IOException { + if(cssTextHandlers != null) { + for(CSSTextHandler v : cssTextHandlers) { + v.handleCSSTextNode(context,node); + } + } + } + public void addJSTextHandler(JSTextHandler v) { + if(jsTextHandler == null) { + jsTextHandler = new ArrayList<JSTextHandler>(); + } + jsTextHandler.add(v); + } + public void handleJSTextNode(ParseContext context, TextNode node) throws IOException { + if(jsTextHandler != null) { + for(JSTextHandler v : jsTextHandler) { + v.handleJSTextNode(context,node); + } + } + } + + 
public void addRemarkTextHandler(RemarkTextHandler v) { + if(remarkTextHandler == null) { + remarkTextHandler = new ArrayList<RemarkTextHandler>(); + } + remarkTextHandler.add(v); + } + public void handleRemarkTextNode(ParseContext context, RemarkNode node) throws IOException { + if(remarkTextHandler != null) { + for(RemarkTextHandler v : remarkTextHandler) { + v.handleRemarkTextNode(context,node); + } + } + } + + public void addContentTextHandler(ContentTextHandler v) { + if(contentTextHandler == null) { + contentTextHandler = new ArrayList<ContentTextHandler>(); + } + contentTextHandler.add(v); + } + public void handleContentTextNode(ParseContext context, TextNode node) throws IOException { + if(contentTextHandler != null) { + for(ContentTextHandler v : contentTextHandler) { + v.handleContentTextNode(context,node); + } + } + } + + public void addParseCompleteHandler(ParseCompleteHandler v) { + if(parseCompleteHandlers == null) { + parseCompleteHandlers = new ArrayList<ParseCompleteHandler>(); + } + parseCompleteHandlers.add(v); + } + public void handleParseComplete(ParseContext context) throws IOException { + if(parseCompleteHandlers != null) { + for(ParseCompleteHandler v : parseCompleteHandlers) { + v.handleParseComplete(context); + } + } + } + + /** + * @return the parserVisitors + */ + public List<ParseEventDelegatorVisitor> getParserVisitors() { + return parserVisitors; + } + + /** + * @param parserVisitors the parserVisitors to set + */ + public void setParserVisitors(List<ParseEventDelegatorVisitor> parserVisitors) { + this.parserVisitors = parserVisitors; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,42 @@ +/* ParseEventDelegatorVisitor + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex; + + +/** + * + * Common interface to decouple application-specific handlers from the + * ParseEventDelegator object: Any object interested in registering for specific + * low-level events can implement this interface, and can be added to the + * ParseEventDelegator parserVisitors list, and it will be given an opportunity + * to register with the ParseEventDelegator for specific events it is + * interested in. 
+ * + * @author brad + * + */ +public interface ParseEventDelegatorVisitor { + public void visit(ParseEventDelegator rules); +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,43 @@ +/* ParseEventHandler + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex; + +import java.io.IOException; + +import org.htmlparser.Node; + +/** + * General interface used with the ContextAwareLexer to handle high-level SAX + * stream events. 
See ParseEventDelegator and ParseEventDelegatorVisitor for + * more detailed usage. + * + * @author brad + * + */ +public interface ParseEventHandler { + public void handleNode(ParseContext context, Node node) + throws IOException; + public void handleParseComplete(ParseContext context) throws IOException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,35 @@ +/* CSSTextHandler + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex.handlers; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseContext; +import org.htmlparser.nodes.TextNode; + +public interface CSSTextHandler { + public void handleCSSTextNode(ParseContext context, TextNode node) + throws IOException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,35 @@ +/* CloseTagHandler + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex.handlers; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseContext; +import org.htmlparser.nodes.TagNode; + +public interface CloseTagHandler { + public void handleCloseTagNode(ParseContext context, TagNode node) + throws IOException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,35 @@ +/* ContentTextHandler + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex.handlers; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseContext; +import org.htmlparser.nodes.TextNode; + +public interface ContentTextHandler { + public void handleContentTextNode(ParseContext context, TextNode node) + throws IOException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,35 @@ +/* JSTextHandler + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex.handlers; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseContext; +import org.htmlparser.nodes.TextNode; + +public interface JSTextHandler { + public void handleJSTextNode(ParseContext context, TextNode node) + throws IOException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,35 @@ +/* OpenTagHandler + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex.handlers; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseContext; +import org.htmlparser.nodes.TagNode; + +public interface OpenTagHandler { + public void handleOpenTagNode(ParseContext context, TagNode node) + throws IOException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,35 @@ +/* ParseCompleteHandler + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex.handlers; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseContext; + + +public interface ParseCompleteHandler { + public void handleParseComplete(ParseContext context) + throws IOException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java 2009-11-05 23:01:46 UTC (rev 2879) @@ -0,0 +1,35 @@ +/* RemarkTextHandler + * + * $Id$ + * + * Created on 12:36:59 PM Nov 5, 2009. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.htmllex.handlers; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseContext; +import org.htmlparser.nodes.RemarkNode; + +public interface RemarkTextHandler { + public void handleRemarkTextNode(ParseContext context, RemarkNode node) + throws IOException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-05 21:47:32
|
Revision: 2878 http://archive-access.svn.sourceforge.net/archive-access/?rev=2878&view=rev Author: bradtofel Date: 2009-11-05 21:47:26 +0000 (Thu, 05 Nov 2009) Log Message: ----------- Removing useless test file. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/partition/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/partition/PartitionerTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/partition/PartitionerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/partition/PartitionerTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/partition/PartitionerTest.java 2009-11-05 21:47:26 UTC (rev 2878) @@ -0,0 +1,91 @@ +package org.archive.wayback.util.partition; + +import java.util.Date; +import java.util.List; +import java.util.TimeZone; + +import org.archive.wayback.util.Timestamp; + +import junit.framework.TestCase; +import org.archive.wayback.util.partition.size.*; + +public class PartitionerTest extends TestCase { + + public void testGetRangeDay() { +// TimeZone.setDefault(TimeZone.getTimeZone("UTC")); +// Partitioner<Date> p = new Partitioner<Date>(new DayPartitionSize()); +// Date start = Timestamp.parseBefore("20070101").getDate(); +// Date end = Timestamp.parseBefore("200701051").getDate(); +// +// List<Partition<Date>> l = p.getRange(start, end); +//// for(Partition<Date> pp : l) { +//// System.out.println("Partition(" + dateToTS(pp.getStart()) +//// + ") - (" + dateToTS(pp.getEnd()) + ")"); +//// } +// assertTrue("P1-1-OK",dateToTS(l.get(0).getStart()).equals("20070101000000")); +// assertTrue("P1-1-OK",dateToTS(l.get(0).getEnd()).equals("20070102000000")); +// +// 
assertTrue("P1-2-OK",dateToTS(l.get(1).getStart()).equals("20070102000000")); +// assertTrue("P1-2-OK",dateToTS(l.get(1).getEnd()).equals("20070103000000")); +// +// assertTrue("P1-3-OK",dateToTS(l.get(2).getStart()).equals("20070103000000")); +// assertTrue("P1-3-OK",dateToTS(l.get(2).getEnd()).equals("20070104000000")); +// +// assertTrue("P1-4-OK",dateToTS(l.get(3).getStart()).equals("20070104000000")); +// assertTrue("P1-4-OK",dateToTS(l.get(3).getEnd()).equals("20070105000000")); +// +// assertTrue("P1-5-OK",dateToTS(l.get(4).getStart()).equals("20070105000000")); +// assertTrue("P1-5-OK",dateToTS(l.get(4).getEnd()).equals("20070106000000")); +// +// assertTrue( "Size OK",l.size() == 5); + } + + public void testGetRangeMonth() { +// TimeZone.setDefault(TimeZone.getTimeZone("UTC")); +// Partitioner<Date> p = new Partitioner<Date>(new MonthPartitionSize()); +// Date start = Timestamp.parseBefore("200611").getDate(); +// Date end = Timestamp.parseBefore("20070505").getDate(); +// +// List<Partition<Date>> l = p.getRange(start, end); +//// for(Partition<Date> pp : l) { +//// System.out.println("Partition(" + dateToTS(pp.getStart()) +//// + ") - (" + dateToTS(pp.getEnd()) + ")"); +//// } +// assertTrue( "Size OK",l.size() == 7); + } + + public void testGetRangeYear() { +// TimeZone.setDefault(TimeZone.getTimeZone("UTC")); +// Partitioner<Date> p = new Partitioner<Date>(new YearPartitionSize()); +// Date start = Timestamp.parseBefore("200611").getDate(); +// Date end = Timestamp.parseBefore("20070505").getDate(); +// +// List<Partition<Date>> l = p.getRange(start, end); +//// for(Partition<Date> pp : l) { +//// System.out.println("Partition(" + dateToTS(pp.getStart()) +//// + ") - (" + dateToTS(pp.getEnd()) + ")"); +//// } +// assertTrue( "Size OK",l.size() == 2); + } + + + private String dateToTS(Date d) { + return new Timestamp(d).getDateStr(); + } + + public void testGetCentered() { +// Partitioner<Date> p = new Partitioner<Date>(new MonthPartitionSize()); +// 
Date center = Timestamp.parseBefore("200501").getDate(); +// Date start = Timestamp.parseBefore("200311").getDate(); +// Date end = Timestamp.parseBefore("20070505").getDate(); +// int max = 10; +// +// List<Partition<Date>> l = p.getCentered(center, start, end, max); +//// for(Partition<Date> pp : l) { +//// System.out.println("Partition(" + dateToTS(pp.getStart()) +//// + ") - (" + dateToTS(pp.getEnd()) + ")"); +//// } +// assertTrue( "Size OK",l.size() == 9); + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/partition/PartitionerTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-05 21:24:38
|
Revision: 2877 http://archive-access.svn.sourceforge.net/archive-access/?rev=2877&view=rev Author: bradtofel Date: 2009-11-05 21:24:31 +0000 (Thu, 05 Nov 2009) Log Message: ----------- Test: Svn extracting of revision. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 21:22:19 UTC (rev 2876) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 21:24:31 UTC (rev 2877) @@ -35,7 +35,8 @@ * */ public String foo() { - return CURRENT_SVN_VERSION; + String tmp = CURRENT_SVN_VERSION.substring(5); + return tmp.substring(0,tmp.length()-2); } public static void main(String[] args) { SvnTest s = new SvnTest(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-05 21:22:28
|
Revision: 2876 http://archive-access.svn.sourceforge.net/archive-access/?rev=2876&view=rev Author: bradtofel Date: 2009-11-05 21:22:19 +0000 (Thu, 05 Nov 2009) Log Message: ----------- Test: Setting svn prop inside string literal. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Property Changed: ---------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 20:57:16 UTC (rev 2875) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 21:22:19 UTC (rev 2876) @@ -30,10 +30,15 @@ * */ public class SvnTest { + public static String CURRENT_SVN_VERSION = "$Rev$"; /** - * @param fooArg + * */ - public static void foo(String fooArg) { - + public String foo() { + return CURRENT_SVN_VERSION; } + public static void main(String[] args) { + SvnTest s = new SvnTest(); + System.out.println("SvnTest version is " + s.foo()); + } } Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java ___________________________________________________________________ Modified: svn:keywords - Id + Id Rev This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-05 20:57:24
|
Revision: 2875 http://archive-access.svn.sourceforge.net/archive-access/?rev=2875&view=rev Author: bradtofel Date: 2009-11-05 20:57:16 +0000 (Thu, 05 Nov 2009) Log Message: ----------- Test: svn adding 'Id' to svn:keywords Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Property Changed: ---------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 20:52:13 UTC (rev 2874) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 20:57:16 UTC (rev 2875) @@ -1,6 +1,6 @@ /* SvnTest * - * $$Id$$: + * $Id$ : * * Created on Nov 5, 2009. * Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java ___________________________________________________________________ Modified: svn:keywords - + Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-05 20:52:29
|
Revision: 2874 http://archive-access.svn.sourceforge.net/archive-access/?rev=2874&view=rev Author: bradtofel Date: 2009-11-05 20:52:13 +0000 (Thu, 05 Nov 2009) Log Message: ----------- Test: double '$' Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 20:49:57 UTC (rev 2873) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 20:52:13 UTC (rev 2874) @@ -1,6 +1,6 @@ /* SvnTest * - * $Id$: + * $$Id$$: * * Created on Nov 5, 2009. * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-05 20:50:03
|
Revision: 2873 http://archive-access.svn.sourceforge.net/archive-access/?rev=2873&view=rev Author: bradtofel Date: 2009-11-05 20:49:57 +0000 (Thu, 05 Nov 2009) Log Message: ----------- test: set svn:properties Property Changed: ---------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java ___________________________________________________________________ Added: svn:keywords + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-05 20:46:04
|
Revision: 2872 http://archive-access.svn.sourceforge.net/archive-access/?rev=2872&view=rev Author: bradtofel Date: 2009-11-05 20:45:55 +0000 (Thu, 05 Nov 2009) Log Message: ----------- Testing Id expansion... Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-05 20:45:55 UTC (rev 2872) @@ -0,0 +1,39 @@ +/* SvnTest + * + * $Id$: + * + * Created on Nov 5, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.surt; + +/** + * @author brad + * + */ +public class SvnTest { + /** + * @param fooArg + */ + public static void foo(String fooArg) { + + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2871 http://archive-access.svn.sourceforge.net/archive-access/?rev=2871&view=rev Author: bradtofel Date: 2009-11-02 22:36:38 +0000 (Mon, 02 Nov 2009) Log Message: ----------- BUGFIX (unreported): attempting to allow wayback.war to execute within jetty. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/StringHttpServletResponseWrapper.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/StringHttpServletResponseWrapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/StringHttpServletResponseWrapper.java 2009-10-30 22:20:15 UTC (rev 2870) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/StringHttpServletResponseWrapper.java 2009-11-02 22:36:38 UTC (rev 2871) @@ -24,9 +24,11 @@ */ package org.archive.wayback.replay; +import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; +import javax.servlet.ServletOutputStream; import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpServletResponseWrapper; @@ -40,6 +42,10 @@ private final static String WRAPPED_CHAR_ENCODING = "UTF-8"; private StringWriter sw = new StringWriter(); private String origEncoding = null; + private static final ServletOutputStream FAKE_OUT = new ServletOutputStream() { + public void write(int b) throws IOException { + } + }; /** * @param response @@ -49,6 +55,13 @@ origEncoding = getCharacterEncoding(); setCharacterEncoding(WRAPPED_CHAR_ENCODING); } + + @Override + public ServletOutputStream getOutputStream() throws IOException { + return FAKE_OUT; + } + + public PrintWriter getWriter() { return new PrintWriter(sw); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-30 22:20:34
|
Revision: 2870 http://archive-access.svn.sourceforge.net/archive-access/?rev=2870&view=rev Author: binzino Date: 2009-10-30 22:20:15 +0000 (Fri, 30 Oct 2009) Log Message: ----------- Fix WAX-68. Add use of optional "versions" file in segments directory to declare which segments are NW 0.10 format. Modified Paths: -------------- tags/nutchwax-0_12_9/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java Modified: tags/nutchwax-0_12_9/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java =================================================================== --- tags/nutchwax-0_12_9/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java 2009-10-29 00:25:57 UTC (rev 2869) +++ tags/nutchwax-0_12_9/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java 2009-10-30 22:20:15 UTC (rev 2870) @@ -24,7 +24,9 @@ import java.io.BufferedReader; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import java.util.Iterator; import org.apache.commons.logging.Log; @@ -123,6 +125,7 @@ } private HashMap segments = new HashMap( ); + private Set oldFormatSegments = null; private boolean perCollection = false; private Summarizer summarizer; @@ -175,6 +178,7 @@ } addRemaps( fs, collectionDir, (Map<String,Segment>) perCollectionSegments ); + checkForOldNutchWAXSegmentFormat( fs, collectionDir ); } else { @@ -188,11 +192,64 @@ if ( ! this.perCollection ) { addRemaps( fs, new Path(segmentsDir), (Map<String,Segment>) segments ); + checkForOldNutchWAXSegmentFormat( fs, new Path(segmentsDir) ); } LOG.info( "segments: " + segments ); } + protected void checkForOldNutchWAXSegmentFormat( FileSystem fs, Path segmentDir ) + throws IOException + { + Path versionsFile = new Path( segmentDir, "versions" ); + + if ( ! 
fs.exists( versionsFile ) ) + { + LOG.info( "Versions file doesn't exist: " + versionsFile ); + + return ; + } + + InputStream is = fs.open( versionsFile ); + + BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + + String line; + while ( (line = reader.readLine()) != null ) + { + String fields[] = line.trim( ).split( "\\s+" ); + + if ( fields.length < 2 ) + { + LOG.warn( "Malformed versions line, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + Segment segment = (Segment) segments.get( fields[0] ); + if ( segment == null ) + { + LOG.warn( "Segment doesn't exist: " + fields[0] ); + continue ; + } + + String version = fields[1]; + if ( ! ( "10".equals( version ) || "12".equals( version ) ) ) + { + LOG.warn( "Malformed versions line, invalid version ("+version+"): " + version ); + continue; + } + + LOG.info( "Version: " + fields[0] + " : " + fields[1] ); + + if ( this.oldFormatSegments == null ) + { + this.oldFormatSegments = new HashSet( ); + } + + this.oldFormatSegments.add( segment ); + } + } + protected void addRemaps( FileSystem fs, Path segmentDir, Map<String,Segment> segments ) throws IOException { @@ -205,7 +262,6 @@ return ; } - // InputStream is = segmentRemapFile.getFileSystem( conf ).open( segmentRemapFile ); InputStream is = fs.open( segmentRemapFile ); BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); @@ -241,20 +297,34 @@ } public byte[] getContent(HitDetails details) throws IOException { - return getSegment(details).getContent(getKey(details)); + // return getSegment(details).getContent(getKey(details)); + Segment s = getSegment( details ); + + return s.getContent( getKey( s, details ) ); } public ParseData getParseData(HitDetails details) throws IOException { - return getSegment(details).getParseData(getKey(details)); + //return getSegment(details).getParseData(getKey(details)); + + Segment s = getSegment( details ); + + return s.getParseData( getKey( s, details 
) ); } public long getFetchDate(HitDetails details) throws IOException { - return getSegment(details).getCrawlDatum(getKey(details)) - .getFetchTime(); + //return getSegment(details).getCrawlDatum(getKey(details)).getFetchTime(); + + Segment s = getSegment( details ); + + return s.getCrawlDatum( getKey( s, details ) ).getFetchTime( ); } public ParseText getParseText(HitDetails details) throws IOException { - return getSegment(details).getParseText(getKey(details)); + //return getSegment(details).getParseText(getKey(details)); + + Segment s = getSegment( details ); + + return s.getParseText( getKey( s, details ) ); } public Summary getSummary(HitDetails details, Query query) @@ -269,7 +339,7 @@ { try { - ParseText parseText = segment.getParseText(getKey(details)); + ParseText parseText = segment.getParseText(getKey(segment, details)); text = (parseText != null) ? parseText.getText() : ""; } catch ( Exception e ) @@ -380,11 +450,30 @@ } } - private Text getKey(HitDetails details) { + /* + private Text getKey(HitDetails details) + { String url = details.getValue("url") + " " + details.getValue("digest"); return new Text(url); } + */ + private Text getKey( Segment segment, HitDetails details) + { + String key = null; + if ( this.oldFormatSegments != null && + this.oldFormatSegments.contains( segment ) ) + { + key = "c=" + details.getValue("collection") + ",u=" + details.getValue( "url"); + } + else + { + key = details.getValue("url") + " " + details.getValue("digest"); + } + + return new Text(key); + } + public void close() throws IOException { Iterator iterator = segments.values().iterator(); while (iterator.hasNext()) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-29 00:26:08
|
Revision: 2869 http://archive-access.svn.sourceforge.net/archive-access/?rev=2869&view=rev Author: binzino Date: 2009-10-29 00:25:57 +0000 (Thu, 29 Oct 2009) Log Message: ----------- Since the ArchiveParallelReader.java was moved, we don't need this directory tree anymore. Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-29 00:18:05
|
Revision: 2868 http://archive-access.svn.sourceforge.net/archive-access/?rev=2868&view=rev Author: binzino Date: 2009-10-29 00:17:34 +0000 (Thu, 29 Oct 2009) Log Message: ----------- Removed, for now at least. Further investigation needed on NW edits. Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/DistributedSearch.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/DistributedSearch.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/DistributedSearch.java 2009-10-28 23:43:21 UTC (rev 2867) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/DistributedSearch.java 2009-10-29 00:17:34 UTC (rev 2868) @@ -1,90 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.nutchwax; - -import java.io.IOException; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.ipc.RPC; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.searcher.NutchBean; - -/** - * A command-line wrapper for the Nutch DistributedSearch$Server class - * which adds the NutchBeanModifier.modify() call to be able to handle - * parallel indices as well as other NutchWAX enhancements. - * </p> - * <p> - * Invoked the same as the regular Nutch DistributedSearch$Server, but - * with the NutchWAX package prefix, i.e. - * </p> - * <code> - * $ nutch org.archive.nutchwax.DistributedSearch\$Server 9000 <crawl-dir> - * </code> - */ -public class DistributedSearch -{ - public static final Log LOG = LogFactory.getLog(DistributedSearch.class); - - private DistributedSearch() {} // no public ctor - - /** The search server. */ - public static class Server - { - - private Server() - { - } - - /** Runs a search server. */ - public static void main(String[] args) throws Exception - { - String usage = "DistributedSearch$Server <port> <index dir>"; - - if (args.length == 0 || args.length > 2) - { - System.err.println(usage); - System.exit(-1); - } - - int port = Integer.parseInt(args[0]); - Path directory = new Path(args[1]); - - Configuration conf = NutchConfiguration.create(); - - org.apache.hadoop.ipc.Server server = getServer(conf, directory, port); - server.start(); - server.join(); - } - - static org.apache.hadoop.ipc.Server getServer(Configuration conf, Path directory, int port) throws IOException - { - NutchBean bean = new NutchBean(conf, directory); - - // Modify the NutchBean, adding the WAX enhancements to it. 
- NutchWaxBean.NutchBeanModifier.modify( bean ); - - int numHandlers = conf.getInt("searcher.num.handlers", 10); - return RPC.getServer(bean, "0.0.0.0", port, numHandlers, true, conf); - } - - } - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2867 http://archive-access.svn.sourceforge.net/archive-access/?rev=2867&view=rev Author: binzino Date: 2009-10-28 23:43:21 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Initial set of edits to update to Nutch 1.0 changes and still compile with NutchWAX edits. Needs testing and probably more work. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java 2009-10-28 23:07:02 UTC (rev 2866) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java 2009-10-28 23:43:21 UTC (rev 2867) @@ -17,467 +17,93 @@ package org.apache.nutch.searcher; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStreamReader; -import java.lang.reflect.Method; -import java.net.InetSocketAddress; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.HashMap; -import java.util.StringTokenizer; -import java.util.TreeSet; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.ipc.RPC; -import org.apache.hadoop.ipc.VersionedProtocol; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseText; import org.apache.nutch.util.NutchConfiguration; -/** Implements the search API over IPC connnections. */ +/** Search/summary servers. 
*/ public class DistributedSearch { - public static final Log LOG = LogFactory.getLog(DistributedSearch.class); private DistributedSearch() {} // no public ctor - /** The distributed search protocol. */ - public static interface Protocol - extends Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks, VersionedProtocol { - - /** The name of the segments searched by this node. */ - String[] getSegmentNames(); - } - - /** The search server. */ - public static class Server { - - private Server() {} - - /** Runs a search server. */ + /** Runs a search/summary server. */ + public static class Server { public static void main(String[] args) throws Exception { - String usage = "DistributedSearch$Server <port> <index dir>"; + final String usage = "DistributedSearch$Server <port> <crawl dir>"; if (args.length == 0 || args.length > 2) { System.err.println(usage); System.exit(-1); } - int port = Integer.parseInt(args[0]); - Path directory = new Path(args[1]); + final int port = Integer.parseInt(args[0]); + final Path directory = new Path(args[1]); - Configuration conf = NutchConfiguration.create(); + final Configuration conf = NutchConfiguration.create(); - org.apache.hadoop.ipc.Server server = getServer(conf, directory, port); + final org.apache.hadoop.ipc.Server server = + getServer(conf, directory, port); server.start(); server.join(); } - - static org.apache.hadoop.ipc.Server getServer(Configuration conf, Path directory, int port) throws IOException{ - NutchBean bean = new NutchBean(conf, directory); - int numHandlers = conf.getInt("searcher.num.handlers", 10); + + static org.apache.hadoop.ipc.Server getServer(Configuration conf, + Path directory, int port) throws IOException{ + final NutchBean bean = new NutchBean(conf, directory); + final int numHandlers = conf.getInt("searcher.num.handlers", 10); return RPC.getServer(bean, "0.0.0.0", port, numHandlers, true, conf); } } - /** The search client. 
*/ - public static class Client extends Thread - implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks, - Runnable { - - private InetSocketAddress[] defaultAddresses; - private boolean[] liveServer; - private HashMap segmentToAddress = new HashMap(); - - private boolean running = true; - private Configuration conf; - private boolean perCollection = false; - - private Path file; - private long timestamp; - private FileSystem fs; - - /** Construct a client talking to servers listed in the named file. - * Each line in the file lists a server hostname and port, separated by - * whitespace. - */ - public Client(Path file, Configuration conf) - throws IOException { - this(readConfig(file, conf), conf); - this.file = file; - this.timestamp = fs.getFileStatus(file).getModificationTime(); - } - - private static InetSocketAddress[] readConfig(Path path, Configuration conf) - throws IOException { - FileSystem fs = FileSystem.get(conf); - BufferedReader reader = - new BufferedReader(new InputStreamReader(fs.open(path))); - try { - ArrayList addrs = new ArrayList(); - String line; - while ((line = reader.readLine()) != null) { - StringTokenizer tokens = new StringTokenizer(line); - if (tokens.hasMoreTokens()) { - String host = tokens.nextToken(); - if (tokens.hasMoreTokens()) { - String port = tokens.nextToken(); - addrs.add(new InetSocketAddress(host, Integer.parseInt(port))); - if (LOG.isInfoEnabled()) { - LOG.info("Client adding server " + host + ":" + port); - } - } - } - } - return (InetSocketAddress[]) - addrs.toArray(new InetSocketAddress[addrs.size()]); - } finally { - reader.close(); + public static class IndexServer { + /** Runs a lucene search server. */ + public static void main(String[] args) throws Exception { + final String usage = "DistributedSearch$IndexServer <port> <crawl dir>"; + if (args.length == 0 || args.length > 2) { + System.err.println(usage); + System.exit(-1); } - } - /** Construct a client talking to the named servers. 
*/ - public Client(InetSocketAddress[] addresses, Configuration conf) throws IOException { - this.conf = conf; - this.defaultAddresses = addresses; - this.liveServer = new boolean[addresses.length]; - this.fs = FileSystem.get(conf); + final int port = Integer.parseInt(args[0]); + final Path dir = new Path(args[1]); - this.perCollection = this.conf.getBoolean( "nutchwax.FetchedSegments.perCollection", false ); + final Configuration conf = NutchConfiguration.create(); - updateSegments(); - setDaemon(true); - start(); + final LuceneSearchBean bean = new LuceneSearchBean(conf, + new Path(dir, "pindexes"), + new Path(dir, "index"), new Path(dir, "indexes")); + final org.apache.hadoop.ipc.RPC.Server server = + RPC.getServer(bean, "0.0.0.0", port, 10, false, conf); + server.start(); + server.join(); } - - private static final Method GET_SEGMENTS; - private static final Method SEARCH; - private static final Method DETAILS; - private static final Method SUMMARY; - static { - try { - GET_SEGMENTS = Protocol.class.getMethod - ("getSegmentNames", new Class[] {}); - SEARCH = Protocol.class.getMethod - ("search", new Class[] { Query.class, Integer.TYPE, String.class, - String.class, Boolean.TYPE}); - DETAILS = Protocol.class.getMethod - ("getDetails", new Class[] { Hit.class}); - SUMMARY = Protocol.class.getMethod - ("getSummary", new Class[] { HitDetails.class, Query.class}); - } catch (NoSuchMethodException e) { - throw new RuntimeException(e); - } - } + } - /** - * Check to see if search-servers file has been modified - * - * @throws IOException - */ - public boolean isFileModified() - throws IOException { - - if (file != null) { - long modTime = fs.getFileStatus(file).getModificationTime(); - if (timestamp < modTime) { - this.timestamp = fs.getFileStatus(file).getModificationTime(); - return true; - } - } - - return false; - } - - /** Updates segment names. 
- * - * @throws IOException - */ - public void updateSegments() throws IOException { - - int liveServers = 0; - int liveSegments = 0; - - if (isFileModified()) { - defaultAddresses = readConfig(file, conf); - } - - // Create new array of flags so they can all be updated at once. - boolean[] updatedLiveServer = new boolean[defaultAddresses.length]; - - // build segmentToAddress map - Object[][] params = new Object[defaultAddresses.length][0]; - String[][] results = - (String[][])RPC.call(GET_SEGMENTS, params, defaultAddresses, this.conf); - - for (int i = 0; i < results.length; i++) { // process results of call - InetSocketAddress addr = defaultAddresses[i]; - String[] segments = results[i]; - if (segments == null) { - updatedLiveServer[i] = false; - if (LOG.isWarnEnabled()) { - LOG.warn("Client: no segments from: " + addr); - } - continue; - } - - for (int j = 0; j < segments.length; j++) { - if (LOG.isTraceEnabled()) { - LOG.trace("Client: segment "+segments[j]+" at "+addr); - } - segmentToAddress.put(segments[j], addr); - } - - updatedLiveServer[i] = true; - liveServers++; - liveSegments += segments.length; - } - - // Now update live server flags. - this.liveServer = updatedLiveServer; - - if (LOG.isInfoEnabled()) { - LOG.info("STATS: "+liveServers+" servers, "+liveSegments+" segments."); - } - } - - /** Return the names of segments searched. */ - public String[] getSegmentNames() { - return (String[]) - segmentToAddress.keySet().toArray(new String[segmentToAddress.size()]); - } - - public Hits search(final Query query, final int numHits, - final String dedupField, final String sortField, - final boolean reverse) throws IOException { - // Get the list of live servers. It would be nice to build this - // list in updateSegments(), but that would create concurrency issues. - // We grab a local reference to the live server flags in case it - // is updated while we are building our list of liveAddresses. 
- boolean[] savedLiveServer = this.liveServer; - int numLive = 0; - for (int i = 0; i < savedLiveServer.length; i++) { - if (savedLiveServer[i]) - numLive++; - } - InetSocketAddress[] liveAddresses = new InetSocketAddress[numLive]; - int[] liveIndexNos = new int[numLive]; - int k = 0; - for (int i = 0; i < savedLiveServer.length; i++) { - if (savedLiveServer[i]) { - liveAddresses[k] = defaultAddresses[i]; - liveIndexNos[k] = i; - k++; - } - } - - Object[][] params = new Object[liveAddresses.length][5]; - for (int i = 0; i < params.length; i++) { - params[i][0] = query; - params[i][1] = new Integer(numHits); - params[i][2] = dedupField; - params[i][3] = sortField; - params[i][4] = Boolean.valueOf(reverse); - } - Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses, this.conf); - - TreeSet queue; // cull top hits from results - - if (sortField == null || reverse) { - queue = new TreeSet(new Comparator() { - public int compare(Object o1, Object o2) { - return ((Comparable)o2).compareTo(o1); // reverse natural order - } - }); - } else { - queue = new TreeSet(); - } - - long totalHits = 0; - Comparable maxValue = null; - for (int i = 0; i < results.length; i++) { - Hits hits = results[i]; - if (hits == null) continue; - totalHits += hits.getTotal(); - for (int j = 0; j < hits.getLength(); j++) { - Hit h = hits.getHit(j); - if (maxValue == null || - ((reverse || sortField == null) - ? 
h.getSortValue().compareTo(maxValue) >= 0 - : h.getSortValue().compareTo(maxValue) <= 0)) { - queue.add(new Hit(liveIndexNos[i], h.getIndexDocNo(), - h.getSortValue(), h.getDedupValue())); - if (queue.size() > numHits) { // if hit queue overfull - queue.remove(queue.last()); // remove lowest in hit queue - maxValue = ((Hit)queue.last()).getSortValue(); // reset maxValue - } - } - } - } - return new Hits(totalHits, (Hit[])queue.toArray(new Hit[queue.size()])); - } - - // version for hadoop-0.5.0.jar - public static final long versionID = 1L; - - private Protocol getRemote(Hit hit) throws IOException { - return (Protocol) - RPC.getProxy(Protocol.class, versionID, defaultAddresses[hit.getIndexNo()], conf); - } - - private Protocol getRemote(HitDetails hit) throws IOException { - InetSocketAddress address = - (InetSocketAddress)segmentToAddress.get(hit.getValue("segment")); - return (Protocol)RPC.getProxy(Protocol.class, versionID, address, conf); - } - - public String getExplanation(Query query, Hit hit) throws IOException { - return getRemote(hit).getExplanation(query, hit); - } - - public HitDetails getDetails(Hit hit) throws IOException { - return getRemote(hit).getDetails(hit); - } - - public HitDetails[] getDetails(Hit[] hits) throws IOException { - InetSocketAddress[] addrs = new InetSocketAddress[hits.length]; - Object[][] params = new Object[hits.length][1]; - for (int i = 0; i < hits.length; i++) { - addrs[i] = defaultAddresses[hits[i].getIndexNo()]; - params[i][0] = hits[i]; - } - return (HitDetails[])RPC.call(DETAILS, params, addrs, conf); - } - - - public Summary getSummary(HitDetails hit, Query query) throws IOException { - return getRemote(hit).getSummary(hit, query); - } - - - /* DIFF: Added handling for perCollection segments. Also info - * messages about each hit to help diagnose typical - * deployment errors. 
- */ - public Summary[] getSummary(HitDetails[] hits, Query query) throws IOException - { - try - { - InetSocketAddress[] addrs = new InetSocketAddress[hits.length]; - Object[][] params = new Object[hits.length][2]; - for (int i = 0; i < hits.length; i++) - { - HitDetails hit = hits[i]; - if ( this.perCollection ) - { - addrs[i] = (InetSocketAddress)segmentToAddress.get(hit.getValue("collection")); - LOG.info( "Hit: " + hit + " addr: " + addrs[i] + " collection:" + hit.getValue("collection") ); - } - else - { - addrs[i] = (InetSocketAddress)segmentToAddress.get(hit.getValue("segment")); - LOG.info( "Hit: " + hit + " addr: " + addrs[i] + " segment:" + hit.getValue("segment") ); - } - params[i][0] = hit; - params[i][1] = query; - } - return (Summary[])RPC.call(SUMMARY, params, addrs, conf); - } - catch ( Exception e ) - { - LOG.warn( "Error getting summaries: ", e ); - return new Summary[hits.length]; - } - } - - public byte[] getContent(HitDetails hit) throws IOException { - return getRemote(hit).getContent(hit); - } - - public ParseData getParseData(HitDetails hit) throws IOException { - return getRemote(hit).getParseData(hit); - } - - public ParseText getParseText(HitDetails hit) throws IOException { - return getRemote(hit).getParseText(hit); - } - - public String[] getAnchors(HitDetails hit) throws IOException { - return getRemote(hit).getAnchors(hit); - } - - public Inlinks getInlinks(HitDetails hit) throws IOException { - return getRemote(hit).getInlinks(hit); - } - - public long getFetchDate(HitDetails hit) throws IOException { - return getRemote(hit).getFetchDate(hit); - } - + public static class SegmentServer { + /** Runs a summary server. 
*/ public static void main(String[] args) throws Exception { - String usage = "DistributedSearch$Client query <host> <port> ..."; - - if (args.length == 0) { + final String usage = + "DistributedSearch$SegmentServer <port> <crawl dir>"; + if (args.length < 2) { System.err.println(usage); - System.exit(-1); + System.exit(1); } - Query query = Query.parse(args[0], NutchConfiguration.create()); - - InetSocketAddress[] addresses = new InetSocketAddress[(args.length-1)/2]; - for (int i = 0; i < (args.length-1)/2; i++) { - addresses[i] = - new InetSocketAddress(args[i*2+1], Integer.parseInt(args[i*2+2])); - } + final Configuration conf = NutchConfiguration.create(); + final int port = Integer.parseInt(args[0]); + final Path segmentsDir = new Path(args[1], "segments"); - Client client = new Client(addresses, NutchConfiguration.create()); - //client.setTimeout(Integer.MAX_VALUE); + final FetchedSegments segments = new FetchedSegments(conf, segmentsDir); - Hits hits = client.search(query, 10, null, null, false); - System.out.println("Total hits: " + hits.getTotal()); - for (int i = 0; i < hits.getLength(); i++) { - System.out.println(" "+i+" "+ client.getDetails(hits.getHit(i))); - } + final org.apache.hadoop.ipc.RPC.Server server = + RPC.getServer(segments, "0.0.0.0", port, conf); + server.start(); + server.join(); } - - public void run() { - while (running){ - try{ - Thread.sleep(10000); - } catch (InterruptedException ie){ - if (LOG.isInfoEnabled()) { - LOG.info("Thread sleep interrupted."); - } - } - try{ - if (LOG.isInfoEnabled()) { - LOG.info("Querying segments from search servers..."); - } - updateSegments(); - } catch (IOException ioe) { - if (LOG.isWarnEnabled()) { LOG.warn("No search servers available!"); } - liveServer = new boolean[defaultAddresses.length]; - } - } - } - - /** - * Stops the watchdog thread. 
- */ - public void close() { - running = false; - interrupt(); - } - - public boolean[] getLiveServer() { - return liveServer; - } } } \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 23:07:12
|
Revision: 2866 http://archive-access.svn.sourceforge.net/archive-access/?rev=2866&view=rev Author: binzino Date: 2009-10-28 23:07:02 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Initial patch with NutchWAX modifications. Needs much more testing, especially with perCollectionSegments and in master/slave mode. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java 2009-10-28 22:10:42 UTC (rev 2865) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java 2009-10-28 23:07:02 UTC (rev 2866) @@ -23,14 +23,20 @@ import java.io.InputStreamReader; import java.io.BufferedReader; -import java.util.HashMap; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; import java.util.Map; -import java.util.Iterator; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; - -import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; import org.apache.nutch.protocol.*; @@ -43,22 +49,91 @@ /** Implements {@link HitSummarizer} and {@link HitContent} for a set of * fetched segments. 
*/ -public class FetchedSegments implements HitSummarizer, HitContent -{ +public class FetchedSegments implements RPCSegmentBean { + public static final Log LOG = LogFactory.getLog(FetchedSegments.class); - private static class Segment implements Closeable { - - private static final Partitioner PARTITIONER = new HashPartitioner(); + public static final long VERSION = 1L; - private FileSystem fs; - private Path segmentDir; + private static final ExecutorService executor = + Executors.newCachedThreadPool(); + private class SummaryTask implements Callable<Summary> { + private final HitDetails details; + private final Query query; + + public SummaryTask(HitDetails details, Query query) { + this.details = details; + this.query = query; + } + + public Summary call() throws Exception { + return getSummary(details, query); + } + } + + /* + private class SegmentUpdater extends Thread { + + @Override + public void run() { + while (true) { + try { + final FileStatus[] fstats = fs.listStatus(segmentsDir, + HadoopFSUtil.getPassDirectoriesFilter(fs)); + final Path[] segmentDirs = HadoopFSUtil.getPaths(fstats); + final Iterator<Map.Entry<String, Segment>> i = segments.entrySet().iterator(); + while (i.hasNext()) { + final Map.Entry<String, Segment> entry = i.next(); + final Segment seg = entry.getValue(); + if (!fs.exists(seg.segmentDir)) { + try { + seg.close(); + } catch (final Exception e) { + / * A segment may fail to close + * since it may already be deleted from + * file system. So we just ignore the + * exception and remove the mapping from + * 'segments'. 
+ * / + } finally { + i.remove(); + } + } + } + + if (segmentDirs != null) { + for (final Path segmentDir : segmentDirs) { + segments.putIfAbsent(segmentDir.getName(), + new Segment(fs, segmentDir, conf)); + } + } + + Thread.sleep(60000); + } catch (final InterruptedException e) { + // ignore + } catch (final IOException e) { + // ignore + } + } + } + + } + */ + + private static class Segment implements java.io.Closeable { + + private static final Partitioner<Text, Writable> PARTITIONER = + new HashPartitioner<Text, Writable>(); + + private final FileSystem fs; + private final Path segmentDir; + private MapFile.Reader[] content; private MapFile.Reader[] parseText; private MapFile.Reader[] parseData; private MapFile.Reader[] crawl; - private Configuration conf; + private final Configuration conf; public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException { this.fs = fs; @@ -73,7 +148,7 @@ } return (CrawlDatum)getEntry(crawl, url, new CrawlDatum()); } - + public byte[] getContent(Text url) throws IOException { synchronized (this) { if (content == null) @@ -97,7 +172,7 @@ } return (ParseText)getEntry(parseText, url, new ParseText()); } - + private MapFile.Reader[] getReaders(String subDir) throws IOException { return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf); } @@ -122,16 +197,27 @@ } - private HashMap segments = new HashMap( ); - private boolean perCollection = false; - private Summarizer summarizer; + //private final ConcurrentMap<String, Segment> segments = new ConcurrentHashMap<String, Segment>(); + private final ConcurrentMap segments = new ConcurrentHashMap(); + private boolean perCollection = false; + private final FileSystem fs; + private final Configuration conf; + private final Path segmentsDir; + //private final SegmentUpdater segUpdater; + private final Summarizer summarizer; /** Construct given a directory containing fetcher output. 
*/ - public FetchedSegments(FileSystem fs, String segmentsDir, Configuration conf) throws IOException - { - this.summarizer = new SummarizerFactory(conf).getSummarizer(); + public FetchedSegments(Configuration conf, Path segmentsDir) + throws IOException { + this.conf = conf; + this.fs = FileSystem.get(this.conf); + final FileStatus[] fstats = fs.listStatus(segmentsDir, + HadoopFSUtil.getPassDirectoriesFilter(fs)); + final Path[] segmentDirs = HadoopFSUtil.getPaths(fstats); + this.summarizer = new SummarizerFactory(this.conf).getSummarizer(); + this.segmentsDir = segmentsDir; + //this.segUpdater = new SegmentUpdater(); - Path[] segmentDirs = HadoopFSUtil.getPaths( fs.listStatus(new Path(segmentsDir), HadoopFSUtil.getPassDirectoriesFilter(fs)) ); if ( segmentDirs == null ) { LOG.warn( "No segment directories: " + segmentsDir ); @@ -161,7 +247,7 @@ Map perCollectionSegments = (Map) this.segments.get( collectionDir.getName( ) ); if ( perCollectionSegments == null ) { - perCollectionSegments = new HashMap( ); + perCollectionSegments = new ConcurrentHashMap( ); this.segments.put( collectionDir.getName( ), perCollectionSegments ); } @@ -179,18 +265,11 @@ else { Path segmentDir = segmentDirs[i]; - segments.put(segmentDir.getName(), new Segment(fs, segmentDir, conf)); + segments.put(segmentDir.getName(), new Segment(this.fs, segmentDir, this.conf)); } } - - // If we not-doing perCollection segments, process a single - // "remap" file for the "segments" dir. - if ( ! 
this.perCollection ) - { - addRemaps( fs, new Path(segmentsDir), (Map<String,Segment>) segments ); - } - - LOG.info( "segments: " + segments ); + + // this.segUpdater.start(); } protected void addRemaps( FileSystem fs, Path segmentDir, Map<String,Segment> segments ) @@ -235,100 +314,73 @@ } } - public String[] getSegmentNames() { return (String[])segments.keySet().toArray(new String[segments.size()]); } public byte[] getContent(HitDetails details) throws IOException { - return getSegment(details).getContent(getUrl(details)); + return getSegment(details).getContent(getKey(details)); } public ParseData getParseData(HitDetails details) throws IOException { - return getSegment(details).getParseData(getUrl(details)); + return getSegment(details).getParseData(getKey(details)); } public long getFetchDate(HitDetails details) throws IOException { - return getSegment(details).getCrawlDatum(getUrl(details)) + return getSegment(details).getCrawlDatum(getKey(details)) .getFetchTime(); } public ParseText getParseText(HitDetails details) throws IOException { - return getSegment(details).getParseText(getUrl(details)); + return getSegment(details).getParseText(getKey(details)); } public Summary getSummary(HitDetails details, Query query) throws IOException { - + if (this.summarizer == null) { return new Summary(); } - - String text = ""; - Segment segment = getSegment(details); - if ( segment != null ) - { - try - { - ParseText parseText = segment.getParseText(getUrl(details)); - text = (parseText != null) ? parseText.getText() : ""; - } - catch ( Exception e ) - { - LOG.error( "segment = " + segment.segmentDir, e ); - } - } - else - { - LOG.warn( "No segment for: " + details ); - } + final Segment segment = getSegment(details); + final ParseText parseText = segment.getParseText(getKey(details)); + final String text = (parseText != null) ? 
parseText.getText() : ""; return this.summarizer.getSummary(text, query); } - - private class SummaryThread extends Thread { - private HitDetails details; - private Query query; - private Summary summary; - private Throwable throwable; + public long getProtocolVersion(String protocol, long clientVersion) + throws IOException { + return VERSION; + } - public SummaryThread(HitDetails details, Query query) { - this.details = details; - this.query = query; + public Summary[] getSummary(HitDetails[] details, Query query) + throws IOException { + final List<Callable<Summary>> tasks = + new ArrayList<Callable<Summary>>(details.length); + for (int i = 0; i < details.length; i++) { + tasks.add(new SummaryTask(details[i], query)); } - public void run() { - try { - this.summary = getSummary(details, query); - } catch (Throwable throwable) { - this.throwable = throwable; - } + List<Future<Summary>> summaries; + try { + summaries = executor.invokeAll(tasks); + } catch (final InterruptedException e) { + throw new RuntimeException(e); } - } - - public Summary[] getSummary(HitDetails[] details, Query query) - throws IOException { - SummaryThread[] threads = new SummaryThread[details.length]; - for (int i = 0; i < threads.length; i++) { - threads[i] = new SummaryThread(details[i], query); - threads[i].start(); - } - - Summary[] results = new Summary[details.length]; - for (int i = 0; i < threads.length; i++) { + final Summary[] results = new Summary[details.length]; + for (int i = 0; i < details.length; i++) { + final Future<Summary> f = summaries.get(i); + Summary summary; try { - threads[i].join(); - } catch (InterruptedException e) { + summary = f.get(); + } catch (final Exception e) { + if (e.getCause() instanceof IOException) { + throw (IOException) e.getCause(); + } throw new RuntimeException(e); } - if (threads[i].throwable instanceof IOException) { - throw (IOException)threads[i].throwable; - } else if (threads[i].throwable != null) { - throw new 
RuntimeException(threads[i].throwable); - } - results[i] = threads[i].summary; + results[i] = summary; } return results; } @@ -380,19 +432,18 @@ } } - private Text getUrl(HitDetails details) { - String url = details.getValue("orig"); - if (StringUtils.isBlank(url)) { - url = details.getValue("url"); - } + private Text getKey(HitDetails details) + { + String url = details.getValue("url") + " " + details.getValue("digest"); + return new Text(url); } public void close() throws IOException { - Iterator iterator = segments.values().iterator(); + final Iterator<Segment> iterator = segments.values().iterator(); while (iterator.hasNext()) { - ((Segment) iterator.next()).close(); + iterator.next().close(); } } - + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 22:11:10
|
Revision: 2865 http://archive-access.svn.sourceforge.net/archive-access/?rev=2865&view=rev Author: binzino Date: 2009-10-28 22:10:42 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Initial revision. Copied from Nutch source, then modified to have NutchWAX extensions/edits which used to be in NutchWaxBean. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneSearchBean.java trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneSearchBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneSearchBean.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneSearchBean.java 2009-10-28 22:10:42 UTC (rev 2865) @@ -0,0 +1,217 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.searcher; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.ArchiveParallelReader; +import org.apache.lucene.index.MultiReader; + +import org.apache.nutch.indexer.FsDirectory; +import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.util.HadoopFSUtil; + + +public class LuceneSearchBean implements RPCSearchBean { + + public static final long VERSION = 1L; + + private IndexSearcher searcher; + + private FileSystem fs; + + private Configuration conf; + + /** + * Construct in a named directory. + * @param conf + * @param dir + * @throws IOException + */ + public LuceneSearchBean(Configuration conf, Path pindexesDir, Path indexDir, Path indexesDir ) + throws IOException { + this.conf = conf; + this.fs = FileSystem.get(this.conf); + init( pindexesDir, indexDir, indexesDir ); + } + + private void init( Path pindexesDir, Path indexDir, Path indexesDir) + throws IOException { + + IndexReader reader = getIndexReader( pindexesDir ); + + if ( reader != null ) + { + this.searcher = new IndexSearcher( reader, this.conf ); + } + else + { + if (this.fs.exists(indexDir)) { + LOG.info("opening merged index in " + indexDir); + this.searcher = new IndexSearcher(indexDir, this.conf); + } else { + LOG.info("opening indexes in " + indexesDir); + + List<Path> vDirs = new ArrayList<Path>(); + FileStatus[] fstats = fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path[] directories = HadoopFSUtil.getPaths(fstats); + for(int i = 0; i < directories.length; i++) { + Path indexdone = new Path(directories[i], Indexer.DONE_NAME); + if(fs.isFile(indexdone)) { + vDirs.add(directories[i]); + } + } + + directories = new Path[ 
vDirs.size() ]; + for(int i = 0; vDirs.size()>0; i++) { + directories[i] = vDirs.remove(0); + } + + this.searcher = new IndexSearcher(directories, this.conf); + } + } + } + + public Hits search(Query query, int numHits, String dedupField, + String sortField, boolean reverse) + throws IOException { + return searcher.search(query, numHits, dedupField, sortField, reverse); + } + + public String getExplanation(Query query, Hit hit) throws IOException { + return searcher.getExplanation(query, hit); + } + + public HitDetails getDetails(Hit hit) throws IOException { + return searcher.getDetails(hit); + } + + public HitDetails[] getDetails(Hit[] hits) throws IOException { + return searcher.getDetails(hits); + } + + public boolean ping() throws IOException { + return true; + } + + public void close() throws IOException { + if (searcher != null) { searcher.close(); } + if (fs != null) { fs.close(); } + } + + public long getProtocolVersion(String protocol, long clientVersion) + throws IOException { + return VERSION; + } + + + private IndexReader getIndexReader( Path pindexesDir ) + throws IOException + { + /* + FileSystem fs = FileSystem.get( conf ); + + Path dir = new Path( conf.get( "searcher.dir", "crawl") ).makeQualified( fs ); + LOG.info( "Looking for Nutch indexes in: " + dir ); + if ( ! fs.exists( dir ) ) + { + LOG.warn( "Directory does not exist: " + dir ); + LOG.warn( "No Nutch indexes will be found and all queries will return no results." ); + + return false; + } + + Path pindexesDir = new Path( dir, "pindexes" ).makeQualified(fs); + */ + + LOG.info( "Looking for NutchWax parallel indexes in: " + pindexesDir ); + if ( ! fs.exists( pindexesDir ) ) + { + LOG.warn( "Parallel indexes directory does not exist: " + pindexesDir ); + + return null; + } + + if ( ! 
fs.getFileStatus( pindexesDir ).isDir( ) ) + { + LOG.warn( "Parallel indexes directory is not a directory: " + pindexesDir ); + + return null; + } + + FileStatus[] fstats = fs.listStatus(pindexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path[] indexDirs = HadoopFSUtil.getPaths( fstats ); + + if ( indexDirs.length < 1 ) + { + LOG.info( "No sub-dirs found in parallel indexes directory: " + pindexesDir ); + + return null; + } + + List<IndexReader> readers = new ArrayList<IndexReader>( indexDirs.length ); + + for ( Path indexDir : indexDirs ) + { + fstats = fs.listStatus( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ); + Path parallelDirs[] = HadoopFSUtil.getPaths( fstats ); + + if ( parallelDirs.length < 1 ) + { + LOG.info( "No sub-directories, skipping: " + indexDir ); + + continue; + } + + ArchiveParallelReader reader = new ArchiveParallelReader( ); + + // Sort the parallelDirs so that we add them in order. Order + // matters to the ParallelReader. + Arrays.sort( parallelDirs ); + + for ( Path p : parallelDirs ) + { + LOG.info( "Adding reader for: " + p ); + reader.add( IndexReader.open( new FsDirectory( fs, p, false, conf ) ) ); + } + + readers.add( reader ); + } + + if ( readers.size( ) == 0 ) + { + LOG.warn( "No parallel indexes in: " + pindexesDir ); + + return null; + } + + MultiReader reader = new MultiReader( readers.toArray( new IndexReader[0] ) ); + + return reader; + } + +} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java 2009-10-28 22:10:42 UTC (rev 2865) @@ -0,0 +1,507 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.*; +import java.net.InetSocketAddress; +import java.util.*; + +import javax.servlet.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.fs.*; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.parse.*; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.util.NutchConfiguration; + +/** + * One stop shopping for search-related functionality. + * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $ + */ +public class NutchBean +implements SearchBean, SegmentBean, HitInlinks, Closeable { + + public static final Log LOG = LogFactory.getLog(NutchBean.class); + public static final String KEY = "nutchBean"; + +// static { +// LogFormatter.setShowThreadIDs(true); +// } + + private SearchBean searchBean; + private SegmentBean segmentBean; + private final HitInlinks linkDb; + + /** BooleanQuery won't permit more than 32 required/prohibited clauses. We + * don't want to use too many of those. 
*/ + private static final int MAX_PROHIBITED_TERMS = 20; + + private final Configuration conf; + + private final FileSystem fs; + + /** Returns the cached instance in the servlet context. + * @see NutchBeanConstructor*/ + public static NutchBean get(ServletContext app, Configuration conf) throws IOException { + final NutchBean bean = (NutchBean)app.getAttribute(KEY); + return bean; + } + + + /** + * + * @param conf + * @throws IOException + */ + public NutchBean(Configuration conf) throws IOException { + this(conf, null); + } + + /** + * Construct in a named directory. + * + * @param conf + * @param dir + * @throws IOException + */ + public NutchBean(Configuration conf, Path dir) throws IOException { + this.conf = conf; + this.fs = FileSystem.get(this.conf); + if (dir == null) + { + dir = new Path( this.conf.get( "searcher.dir", "crawl" ) ).makeQualified( fs ); + } + + LOG.info( "Looking for Nutch indexes in: " + dir ); + if ( ! fs.exists( dir ) ) + { + LOG.error( "Directory does not exist: " + dir ); + LOG.error( "NutchBean not modified." ); + LOG.error( "No Nutch indexes will be found and all queries will return no results." 
); + } + + final Path luceneConfig = new Path( dir, "search-servers.txt" ); + final Path solrConfig = new Path( dir, "solr-servers.txt" ); + final Path segmentConfig = new Path( dir, "segment-servers.txt" ); + + if (fs.exists(luceneConfig) || fs.exists(solrConfig)) { + searchBean = new DistributedSearchBean(conf, luceneConfig, solrConfig); + } else { + final Path pindexesDir = new Path( dir, "pindexes" ); + final Path indexDir = new Path( dir, "index" ); + final Path indexesDir = new Path( dir, "indexes" ); + searchBean = new LuceneSearchBean( conf, pindexesDir, indexDir, indexesDir ); + } + + if (fs.exists(segmentConfig)) { + segmentBean = new DistributedSegmentBean(conf, segmentConfig); + } else if (fs.exists(luceneConfig)) { + segmentBean = new DistributedSegmentBean(conf, luceneConfig); + } else { + segmentBean = new FetchedSegments(conf, new Path(dir, "segments")); + } + + linkDb = new LinkDbInlinks(fs, new Path(dir, "linkdb"), conf); + } + + public static List<InetSocketAddress> readAddresses(Path path, + Configuration conf) throws IOException { + final List<InetSocketAddress> addrs = new ArrayList<InetSocketAddress>(); + for (final String line : readConfig(path, conf)) { + final StringTokenizer tokens = new StringTokenizer(line); + if (tokens.hasMoreTokens()) { + final String host = tokens.nextToken(); + if (tokens.hasMoreTokens()) { + final String port = tokens.nextToken(); + addrs.add(new InetSocketAddress(host, Integer.parseInt(port))); + } + } + } + return addrs; + } + + public static List<String> readConfig(Path path, Configuration conf) + throws IOException { + final FileSystem fs = FileSystem.get(conf); + final BufferedReader reader = + new BufferedReader(new InputStreamReader(fs.open(path))); + try { + final ArrayList<String> addrs = new ArrayList<String>(); + String line; + while ((line = reader.readLine()) != null) { + addrs.add(line); + } + return addrs; + } finally { + reader.close(); + } + } + + public String[] getSegmentNames() throws 
IOException { + return segmentBean.getSegmentNames(); + } + + public Hits search(Query query, int numHits) throws IOException { + return search(query, numHits, null, null, false); + } + + public Hits search(Query query, int numHits, + String dedupField, String sortField, boolean reverse) + throws IOException { + + return searchBean.search(query, numHits, dedupField, sortField, reverse); + } + + @SuppressWarnings("serial") + private class DupHits extends ArrayList<Hit> { + private boolean maxSizeExceeded; + } + + /** Search for pages matching a query, eliminating excessive hits from the + * same site. Hits after the first <code>maxHitsPerDup</code> from the same + * site are removed from results. The remaining hits have {@link + * Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero then all + * hits are returned. + * + * @param query query + * @param numHits number of requested hits + * @param maxHitsPerDup the maximum hits returned with matching values, or zero + * @return Hits the matching hits + * @throws IOException + */ + public Hits search(Query query, int numHits, int maxHitsPerDup) + throws IOException { + return search(query, numHits, maxHitsPerDup, "site", null, false); + } + + /** Search for pages matching a query, eliminating excessive hits with + * matching values for a named field. Hits after the first + * <code>maxHitsPerDup</code> are removed from results. The remaining hits + * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero + * then all hits are returned. 
+ * + * @param query query + * @param numHits number of requested hits + * @param maxHitsPerDup the maximum hits returned with matching values, or zero + * @param dedupField field name to check for duplicates + * @return Hits the matching hits + * @throws IOException + */ + public Hits search(Query query, int numHits, + int maxHitsPerDup, String dedupField) + throws IOException { + return search(query, numHits, maxHitsPerDup, dedupField, null, false); + } + /** Search for pages matching a query, eliminating excessive hits with + * matching values for a named field. Hits after the first + * <code>maxHitsPerDup</code> are removed from results. The remaining hits + * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero + * then all hits are returned. + * + * @param query query + * @param numHits number of requested hits + * @param maxHitsPerDup the maximum hits returned with matching values, or zero + * @param dedupField field name to check for duplicates + * @param sortField Field to sort on (or null if no sorting). + * @param reverse True if we are to reverse sort by <code>sortField</code>. 
+ * @return Hits the matching hits + * @throws IOException + */ + public Hits search(Query query, int numHits, + int maxHitsPerDup, String dedupField, + String sortField, boolean reverse) + throws IOException { + if (maxHitsPerDup <= 0) // disable dup checking + return search(query, numHits, dedupField, sortField, reverse); + + final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f); + int numHitsRaw = (int)(numHits * rawHitsFactor); + if (LOG.isInfoEnabled()) { + LOG.info("searching for "+numHitsRaw+" raw hits"); + } + Hits hits = searchBean.search(query, numHitsRaw, + dedupField, sortField, reverse); + final long total = hits.getTotal(); + final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>(); + final List<Hit> resultList = new ArrayList<Hit>(); + final Set<Hit> seen = new HashSet<Hit>(); + final List<String> excludedValues = new ArrayList<String>(); + boolean totalIsExact = true; + for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) { + // get the next raw hit + if (rawHitNum >= hits.getLength()) { + // optimize query by prohibiting more matches on some excluded values + final Query optQuery = (Query)query.clone(); + for (int i = 0; i < excludedValues.size(); i++) { + if (i == MAX_PROHIBITED_TERMS) + break; + optQuery.addProhibitedTerm(excludedValues.get(i), + dedupField); + } + numHitsRaw = (int)(numHitsRaw * rawHitsFactor); + if (LOG.isInfoEnabled()) { + LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery); + } + hits = searchBean.search(optQuery, numHitsRaw, + dedupField, sortField, reverse); + if (LOG.isInfoEnabled()) { + LOG.info("found "+hits.getTotal()+" raw hits"); + } + rawHitNum = -1; + continue; + } + + final Hit hit = hits.getHit(rawHitNum); + if (seen.contains(hit)) + continue; + seen.add(hit); + + // get dup hits for its value + final String value = hit.getDedupValue(); + DupHits dupHits = dupToHits.get(value); + if (dupHits == null) + dupToHits.put(value, dupHits 
= new DupHits()); + + // does this hit exceed maxHitsPerDup? + if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit + if (!dupHits.maxSizeExceeded) { + + // mark prior hits with moreFromDupExcluded + for (int i = 0; i < dupHits.size(); i++) { + dupHits.get(i).setMoreFromDupExcluded(true); + } + dupHits.maxSizeExceeded = true; + + excludedValues.add(value); // exclude dup + } + totalIsExact = false; + } else { // no -- collect the hit + resultList.add(hit); + dupHits.add(hit); + + // are we done? + // we need to find one more than asked for, so that we can tell if + // there are more hits to be shown + if (resultList.size() > numHits) + break; + } + } + + final Hits results = + new Hits(total, + resultList.toArray(new Hit[resultList.size()])); + results.setTotalIsExact(totalIsExact); + return results; + } + + + public String getExplanation(Query query, Hit hit) throws IOException { + return searchBean.getExplanation(query, hit); + } + + public HitDetails getDetails(Hit hit) throws IOException { + return searchBean.getDetails(hit); + } + + public HitDetails[] getDetails(Hit[] hits) throws IOException { + return searchBean.getDetails(hits); + } + + public Summary getSummary(HitDetails hit, Query query) throws IOException { + return segmentBean.getSummary(hit, query); + } + + public Summary[] getSummary(HitDetails[] hits, Query query) + throws IOException { + return segmentBean.getSummary(hits, query); + } + + public byte[] getContent(HitDetails hit) throws IOException { + return segmentBean.getContent(hit); + } + + public ParseData getParseData(HitDetails hit) throws IOException { + return segmentBean.getParseData(hit); + } + + public ParseText getParseText(HitDetails hit) throws IOException { + return segmentBean.getParseText(hit); + } + + public String[] getAnchors(HitDetails hit) throws IOException { + return linkDb.getAnchors(hit); + } + + public Inlinks getInlinks(HitDetails hit) throws IOException { + return linkDb.getInlinks(hit); + } + + public long 
getFetchDate(HitDetails hit) throws IOException { + return segmentBean.getFetchDate(hit); + } + + public void close() throws IOException { + if (searchBean != null) { searchBean.close(); } + if (segmentBean != null) { segmentBean.close(); } + if (linkDb != null) { linkDb.close(); } + if (fs != null) { fs.close(); } + } + + public boolean ping() { + return true; + } + + /** For debugging. */ + public static void main(String[] args) throws Exception { + + String usage = "NutchWaxBean [options] query" + + "\n\t-h <n> Hits per site" + + "\n\t-n <n> Number of results to find" + + "\n\t-d <dir> Search directory" + + "\n"; + + if ( args.length == 0 ) + { + System.err.println( usage ); + System.exit( -1 ); + } + + String queryString = args[args.length - 1]; + String searchDir = null; + int hitsPerSite = 0; + int numHits = 10; + for ( int i = 0 ; i < args.length - 1 ; i++ ) + { + try + { + if ( "-h".equals( args[i] ) ) + { + i++; + hitsPerSite = Integer.parseInt( args[i] ); + } + if ( "-n".equals( args[i] ) ) + { + i++; + numHits = Integer.parseInt( args[i] ); + } + if ( "-d".equals( args[i] ) ) + { + i++; + searchDir = args[i]; + } + } + catch ( NumberFormatException nfe ) + { + System.err.println( "Error: not a numeric value: " + args[i] ); + System.err.println( usage ); + System.exit( -1 ); + } + } + + final Configuration conf = NutchConfiguration.create(); + + if ( searchDir != null ) + { + conf.set( "searcher.dir", searchDir ); + } + System.out.println( "Searching in directory: " + conf.get( "searcher.dir" ) ); + System.out.println( "Hits per site: " + hitsPerSite ); + + final NutchBean bean = new NutchBean(conf); + + try { + final Query query = Query.parse( queryString, conf); + final Hits hits = bean.search(query, 10); + System.out.println( "Total hits : " + hits.getTotal () ); + System.out.println( "Hits length: " + hits.getLength() ); + final int length = (int)Math.min(hits.getTotal(), 10); + final Hit[] show = hits.getHits(0, length); + final HitDetails[] details 
= bean.getDetails(show); + final Summary[] summaries = bean.getSummary(details, query); + + for (int i = 0; i < length; i++) + { + System.out.println( " " + + i + + " " + + java.util.Arrays.asList( details[i].getValues( "segment" ) ) + + " " + + java.util.Arrays.asList( details[i].getValues( "url" ) ) + + " " + + java.util.Arrays.asList( details[i].getValues( "digest" ) ) + + " " + + java.util.Arrays.asList( details[i].getValues( "date" ) ) + + " " + + java.util.Arrays.asList( details[i].getValues( "title" ) ) + + "\n" + + summaries[i] ); + } + } catch (Throwable t) { + LOG.error("Exception occured while executing search: " + t, t); + System.exit(1); + } + System.exit(0); + } + + public long getProtocolVersion(String className, long clientVersion) + throws IOException { + if(RPCSearchBean.class.getName().equals(className) && + searchBean instanceof RPCSearchBean) { + + final RPCSearchBean rpcBean = (RPCSearchBean)searchBean; + return rpcBean.getProtocolVersion(className, clientVersion); + } else if (RPCSegmentBean.class.getName().equals(className) && + segmentBean instanceof RPCSegmentBean) { + + final RPCSegmentBean rpcBean = (RPCSegmentBean)segmentBean; + return rpcBean.getProtocolVersion(className, clientVersion); + } else { + throw new IOException("Unknown Protocol classname:" + className); + } + } + + /** Responsible for constructing a NutchBean singleton instance and + * caching it in the servlet context. 
This class should be registered in + * the deployment descriptor as a listener + */ + public static class NutchBeanConstructor implements ServletContextListener { + + public void contextDestroyed(ServletContextEvent sce) { } + + public void contextInitialized(ServletContextEvent sce) { + final ServletContext app = sce.getServletContext(); + final Configuration conf = NutchConfiguration.get(app); + + LOG.info("creating new bean"); + NutchBean bean = null; + try { + bean = new NutchBean(conf); + app.setAttribute(KEY, bean); + } + catch (final IOException ex) { + LOG.error(StringUtils.stringifyException(ex)); + } + } + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 22:06:53
|
Revision: 2864 http://archive-access.svn.sourceforge.net/archive-access/?rev=2864&view=rev Author: binzino Date: 2009-10-28 22:06:07 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Add constructor which takes IndexReader. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2009-10-28 22:03:37 UTC (rev 2863) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2009-10-28 22:06:07 UTC (rev 2864) @@ -71,6 +71,14 @@ init(IndexReader.open(getDirectory(index)), conf); } + public IndexSearcher( IndexReader reader, Configuration conf ) + throws IOException + { + this.conf = conf; + this.fs = FileSystem.get( conf ); + init( reader, conf ); + } + private void init(IndexReader reader, Configuration conf) throws IOException { this.reader = reader; this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 22:04:01
|
Revision: 2863 http://archive-access.svn.sourceforge.net/archive-access/?rev=2863&view=rev Author: binzino Date: 2009-10-28 22:03:37 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Removed as all NutchWAX mods/edits have been moved into NutchBean in the Nutch source overlay. Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2009-10-28 21:55:11 UTC (rev 2862) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2009-10-28 22:03:37 UTC (rev 2863) @@ -1,296 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.nutchwax; - -//import java.io.*; -import java.util.*; -import java.lang.reflect.Field; -import javax.servlet.ServletContext; -import javax.servlet.ServletContextEvent; -import javax.servlet.ServletContextListener; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import org.apache.nutch.searcher.NutchBean; -import org.apache.nutch.searcher.IndexSearcher; -import org.apache.nutch.searcher.Query; -import org.apache.nutch.searcher.HitDetails; -import org.apache.nutch.searcher.Hit; -import org.apache.nutch.searcher.Hits; -import org.apache.nutch.searcher.Summary; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.conf.Configuration; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.ArchiveParallelReader; -import org.apache.lucene.index.MultiReader; - -import org.apache.nutch.util.HadoopFSUtil; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.indexer.FsDirectory; - -/** - * Utility class to use and extend the NutchBean class for reading - * from parallel indices. - * - * This can be used from the command-line to run test/debug searches, - * the same as NutchBean, but using parallel indices. - * - * NutchWaxBean doesn't extend NutchBean directly since all the good - * stuff inside of NutchBean is declared private. So, we dynamically - * modify a NutchBean instance via reflection to inject our own - * IndexReader that reads from a set of parallel indices. - * - * Before you recoil in horror over this approach, the alternatives - * were none too pretty. Sub-classing won't work since all the - * NutchBean data members are declared private. We could copy the - * NutchBean.java into our own source base and effectively over-write - * the Nutch version when we compile, but that is a maintenance - * headache of extreme magnitude. 
Plus, we'd probably have to - * copy/past/edit multiple Java source files. - * - * Ideally, Nutch would use some sort of dependency injection system, - * or at least make the NutchBean data members have public get/set - * methods (like a bean should). For now, doing dynamic injection via - * reflection seemed the least obtrusive. - */ -public class NutchWaxBean -{ - public static final Log LOG = LogFactory.getLog( NutchWaxBean.class ); - - /** - * Static utility class for modifying a NutchBean instance. - */ - public static class NutchBeanModifier - { - /** - * Modify the NutchBean by replacing the IndexReader in its - * IndexSearcher with one we create that uses - * ArchiveParallelReader for searching across parallel indices. - */ - public static boolean modify( NutchBean bean ) - { - try - { - LOG.info( "Modifying NutchBean with NutchWAX extensions..." ); - - // First, get the configuration from the bean. Gosh it would be - // nice if NutchBean had a getConf() public method, wouldn't it? - Field fConf = NutchBean.class.getDeclaredField( "conf" ); - fConf.setAccessible( true ); - - // The rest of this code is similar to NutchBean in that it - // looks for a 'pindexes' directory as a sibling of the - // 'indexes' directory that NutchBean finds. - Configuration conf = (Configuration) fConf.get( bean ); - - FileSystem fs = FileSystem.get( conf ); - - Path dir = new Path( conf.get( "searcher.dir", "crawl") ).makeQualified( fs ); - LOG.info( "Looking for Nutch indexes in: " + dir ); - if ( ! fs.exists( dir ) ) - { - LOG.warn( "Directory does not exist: " + dir ); - LOG.warn( "NutchBean not modified." ); - LOG.warn( "No Nutch indexes will be found and all queries will return no results." ); - - return false; - } - - Path indexesDir = new Path( dir, "pindexes" ).makeQualified(fs); - LOG.info( "Looking for NutchWax parallel indexes in: " + indexesDir ); - if ( ! 
fs.exists( indexesDir ) ) - { - LOG.warn( "Parallel indexes directory does not exist: " + indexesDir ); - LOG.warn( "NutchBean not modified." ); - - return false; - } - - if ( ! fs.getFileStatus( indexesDir ).isDir( ) ) - { - LOG.warn( "Parallel indexes directory is not a directory: " + indexesDir ); - LOG.warn( "NutchBean not modified." ); - - return false; - } - - FileStatus[] fstats = fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); - Path[] indexDirs = HadoopFSUtil.getPaths( fstats ); - - if ( indexDirs.length < 1 ) - { - LOG.info( "No sub-dirs found in parallel indexes directory: " + indexesDir ); - LOG.warn( "NutchBean not modified." ); - - return false; - } - - List<IndexReader> readers = new ArrayList<IndexReader>( indexDirs.length ); - - for ( Path indexDir : indexDirs ) - { - fstats = fs.listStatus( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ); - Path parallelDirs[] = HadoopFSUtil.getPaths( fstats ); - - if ( parallelDirs.length < 1 ) - { - LOG.info( "No sub-directories, skipping: " + indexDir ); - - continue; - } - - ArchiveParallelReader reader = new ArchiveParallelReader( ); - - // Sort the parallelDirs so that we add them in order. Order - // matters to the ParallelReader. - Arrays.sort( parallelDirs ); - - for ( Path p : parallelDirs ) - { - LOG.info( "Adding reader for: " + p ); - reader.add( IndexReader.open( new FsDirectory( fs, p, false, conf ) ) ); - } - - readers.add( reader ); - } - - if ( readers.size( ) == 0 ) - { - LOG.warn( "No parallel indexes in: " + indexesDir ); - LOG.warn( "NutchBean not modified." ); - - return false; - } - - MultiReader reader = new MultiReader( readers.toArray( new IndexReader[0] ) ); - - // Now, inject the 'reader' into the NutchBean's IndexSearcher via reflection. 
- Field fSearcher = NutchBean.class.getDeclaredField( "searcher" ); - Field fReader = IndexSearcher.class.getDeclaredField( "reader" ); - Field fLuceneSearcher = IndexSearcher.class.getDeclaredField( "luceneSearcher" ); - - fSearcher .setAccessible( true ); - fReader .setAccessible( true ); - fLuceneSearcher.setAccessible( true ); - - org.apache.lucene.search.IndexSearcher newLuceneSearcher = new org.apache.lucene.search.IndexSearcher( reader ); - - IndexSearcher searcher = (IndexSearcher) fSearcher.get( bean ); - fLuceneSearcher.set( searcher, newLuceneSearcher ); - fReader .set( searcher, reader ); - - return true; - } - catch ( Exception e ) - { - throw new RuntimeException( e ); - } - } - } - - /** - * Similar to code in NutchBean. This receives the events from the - * servlet container and modifies the NutchBean instance put there - * by the NutchBeanConstructor listener. For this to work, it must - * be declared after the NutchBeanConstructor in the web.xml file, - * e.g. - * <pre> - * <listener> - * <listener-class>org.apache.nutch.searcher.NutchBean$NutchBeanConstructor</listener-class> - * <listener-class>org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor</listener-class> - * </listener> - * </pre> - */ - public static class NutchWaxBeanConstructor implements ServletContextListener - { - - public void contextDestroyed( ServletContextEvent sce ) - { - } - - public void contextInitialized( ServletContextEvent sce ) - { - ServletContext app = sce.getServletContext(); - NutchBean bean = (NutchBean) app.getAttribute( NutchBean.KEY ); - - if ( bean == null ) - { - LOG.fatal( "No value for \"" + NutchBean.KEY + "\" in servlet context" ); - - return ; - } - - // Modify the NutchBean. - NutchBeanModifier.modify( bean ); - } - - } - - /** - * Simple command-line driver akin to NutchBean.main that peforms - * the ben modification. Useful for testing and debugging from the - * command-line. 
- */ - public static void main(String[] args) throws Exception - { - String usage = "NutchWaxBean query"; - - if (args.length == 0) - { - System.err.println(usage); - System.exit(-1); - } - - Configuration conf = NutchConfiguration.create(); - - NutchBean bean = new NutchBean(conf); - NutchBeanModifier.modify( bean ); - - Query query = Query.parse(args[0], conf); - Hits hits = bean.search(query, 10); - System.out.println("Total hits: " + hits.getTotal()); - int length = (int)Math.min(hits.getTotal(), 10); - Hit[] show = hits.getHits(0, length); - HitDetails[] details = bean.getDetails(show); - Summary[] summaries = bean.getSummary(details, query); - - for (int i = 0; i < hits.getLength(); i++) - { - // Use a slightly more verbose output than NutchBean. - System.out.println( " " - + i - + " " - + java.util.Arrays.asList( details[i].getValues( "segment" ) ) - + " " - + java.util.Arrays.asList( details[i].getValues( "url" ) ) - + " " - + java.util.Arrays.asList( details[i].getValues( "orig" ) ) - + " " - + java.util.Arrays.asList( details[i].getValues( "digest" ) ) - + " " - + java.util.Arrays.asList( details[i].getValues( "date" ) ) - + "\n" - + summaries[i] ); - } - } - - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 21:55:33
|
Revision: 2862 http://archive-access.svn.sourceforge.net/archive-access/?rev=2862&view=rev Author: binzino Date: 2009-10-28 21:55:11 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Removed NutchWAX scoring filter since we now recommend to do the scoring/boosting after the index is built. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-28 21:49:56 UTC (rev 2861) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-10-28 21:55:11 UTC (rev 2862) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|urlfilter-nutchwax</value> </property> <!-- This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 21:50:40
|
Revision: 2861 http://archive-access.svn.sourceforge.net/archive-access/?rev=2861&view=rev Author: binzino Date: 2009-10-28 21:49:56 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Need to explicitly set the score, otherwise it will remain the default value of 0, which is then put into the Lucene index as the 'boost' value. And a boost value of 0 makes the documents never show up in a search. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2009-10-28 21:24:26 UTC (rev 2860) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/IndexerMapReduce.java 2009-10-28 21:49:56 UTC (rev 2861) @@ -113,6 +113,8 @@ // skip documents discarded by indexing filters if (doc == null) return; + doc.setScore( 1.0f ); + output.collect(key, doc); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 21:24:58
|
Revision: 2860 http://archive-access.svn.sourceforge.net/archive-access/?rev=2860&view=rev Author: binzino Date: 2009-10-28 21:24:26 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Moved to Nutch source overlay so that edits in Nutch sources can access this class. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-28 03:40:14 UTC (rev 2859) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-28 21:24:26 UTC (rev 2860) @@ -1,616 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * ARCHIVE: This must be in the lucene index package because it needs - * to call protected methods on other IndexReader objects. - */ -package org.apache.lucene.index; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.FieldSelector; -import org.apache.lucene.document.FieldSelectorResult; -import org.apache.lucene.document.Fieldable; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.index.TermFreqVector; -import org.apache.lucene.index.TermPositions; -import org.apache.lucene.index.TermVectorMapper; - -import java.io.IOException; -import java.util.*; - - -/** An IndexReader which reads multiple, parallel indexes. Each index added - * must have the same number of documents, but typically each contains - * different fields. Each document contains the union of the fields of all - * documents with the same document number. When searching, matches for a - * query term are from the first index added that has the field. - * - * <p>This is useful, e.g., with collections that have large fields which - * change rarely and small fields that change more frequently. The smaller - * fields may be re-indexed in a new index and both indexes may be searched - * together. - * - * <p><strong>Warning:</strong> It is up to you to make sure all indexes - * are created and modified the same way. For example, if you add - * documents to one index, you need to add the same documents in the - * same order to the other indexes. <em>Failure to do so will result in - * undefined behavior</em>. 
- */ -public class ArchiveParallelReader extends IndexReader { - private List readers = new ArrayList(); - private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close - boolean incRefReaders = false; - private SortedMap fieldToReader = new TreeMap(); - - private int maxDoc; - private int numDocs; - private boolean hasDeletions; - - /** Construct a ArchiveParallelReader. - * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p> - */ - public ArchiveParallelReader() throws IOException { this(true); } - - /** Construct a ArchiveParallelReader. - * @param closeSubReaders indicates whether the subreaders should be closed - * when this ArchiveParallelReader is closed - */ - public ArchiveParallelReader(boolean closeSubReaders) throws IOException { - super(); - this.incRefReaders = !closeSubReaders; - } - - /** Add an IndexReader. - * @throws IOException if there is a low-level IO error - */ - public void add(IndexReader reader) throws IOException - { - ensureOpen(); - if (readers.size() == 0) { - this.maxDoc = reader.maxDoc(); - this.numDocs = reader.numDocs(); - this.hasDeletions = reader.hasDeletions(); - } - - if (reader.maxDoc() != maxDoc) // check compatibility - throw new IllegalArgumentException - ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc()); - if (reader.numDocs() != numDocs) - throw new IllegalArgumentException - ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); - - Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); - Iterator i = fields.iterator(); - while (i.hasNext()) { // update fieldToReader map - String field = (String)i.next(); - if (fieldToReader.get(field) == null) - fieldToReader.put(field, reader); - } - - readers.add(reader); - - if (incRefReaders) { - reader.incRef(); - } - decrefOnClose.add(Boolean.valueOf(incRefReaders)); - } - - /** - * Tries to reopen the subreaders. 
- * <br> - * If one or more subreaders could be re-opened (i. e. subReader.reopen() - * returned a new instance != subReader), then a new ArchiveParallelReader instance - * is returned, otherwise this instance is returned. - * <p> - * A re-opened instance might share one or more subreaders with the old - * instance. Index modification operations result in undefined behavior - * when performed before the old instance is closed. - * (see {@link IndexReader#reopen()}). - * <p> - * If subreaders are shared, then the reference count of those - * readers is increased to ensure that the subreaders remain open - * until the last referring reader is closed. - * - * @throws CorruptIndexException if the index is corrupt - * @throws IOException if there is a low-level IO error - */ - public IndexReader reopen() throws CorruptIndexException, IOException { - ensureOpen(); - - boolean reopened = false; - List newReaders = new ArrayList(); - List newDecrefOnClose = new ArrayList(); - - boolean success = false; - - try { - - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = oldReader.reopen(); - newReaders.add(newReader); - // if at least one of the subreaders was updated we remember that - // and return a new MultiReader - if (newReader != oldReader) { - reopened = true; - } - } - - if (reopened) { - ArchiveParallelReader pr = new ArchiveParallelReader(); - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = (IndexReader) newReaders.get(i); - if (newReader == oldReader) { - newDecrefOnClose.add(Boolean.TRUE); - newReader.incRef(); - } else { - // this is a new subreader instance, so on close() we don't - // decRef but close it - newDecrefOnClose.add(Boolean.FALSE); - } - pr.add(newReader); - } - pr.decrefOnClose = newDecrefOnClose; - pr.incRefReaders = incRefReaders; - success = true; - return pr; - } else { - success = true; - // No 
subreader was refreshed - return this; - } - } finally { - if (!success && reopened) { - for (int i = 0; i < newReaders.size(); i++) { - IndexReader r = (IndexReader) newReaders.get(i); - if (r != null) { - try { - if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) { - r.decRef(); - } else { - r.close(); - } - } catch (IOException ignore) { - // keep going - we want to clean up as much as possible - } - } - } - } - } - } - - - public int numDocs() { - // Don't call ensureOpen() here (it could affect performance) - return numDocs; - } - - public int maxDoc() { - // Don't call ensureOpen() here (it could affect performance) - return maxDoc; - } - - public boolean hasDeletions() { - // Don't call ensureOpen() here (it could affect performance) - return hasDeletions; - } - - // check first reader - public boolean isDeleted(int n) { - // Don't call ensureOpen() here (it could affect performance) - if (readers.size() > 0) - return ((IndexReader)readers.get(0)).isDeleted(n); - return false; - } - - // delete in all readers - protected void doDelete(int n) throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).deleteDocument(n); - } - hasDeletions = true; - } - - /** - * @see org.apache.lucene.index.ParallelReader.doUndeleteAll - */ - protected void doUndeleteAll() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).undeleteAll(); - } - hasDeletions = false; - } - - /** - * <p><strong>ARCHIVE</strong> modification</p> - * <p>Return a <code>Document</code> with fields merged from parallel - * indices. The values for a given field will <strong>only</strong> - * come from the first index that has the field. 
This matches the - * searching behavior where a field is only searched in the first - * index that has the field.</p> - * <p>This differs from the bundled Lucene <code>ParallelReader</code>, - * which adds all values from every index that has the field.</p> - * <p>The <code>fieldSelector<code> parameter is ignored.</p> - * <h3>Implementation Notes</h3> - * <p>Since getting the document from the reader is the expensive - * operation, we only get it once from each reader. Once we've - * gotten the document from the reader, we iterate through the - * fields and only copy those fields that are mapped to the reader.</p> - * <p>The first implementation iterated through the field names, - * getting the document from the corresponding reader for each - * field name (10 fields => 10 document gets) which was a big - * performance hit.</p> - * <p>In this implementation, there are only as many document gets as - * there are readers.</p> - * @param n ordinal position of document to return - * @param fieldSelector ignored - * @return the document with field values assembled from parallel indicdes - * @throws CorruptIndexException if the index is corrupt - * @throws IOException if there is a low-level IO error - */ - public Document document(int n, FieldSelector fieldSelector) - throws CorruptIndexException, IOException - { - ensureOpen(); - Document result = new Document(); - - for ( IndexReader reader : (List<IndexReader>) readers ) - { - Document d = reader.document( n ); - - for ( Fieldable f : ((List<Fieldable>) d.getFields()) ) - { - if ( fieldToReader.get( f.name( ) ) == reader ) - { - result.add( f ); - } - } - } - - return result; - } - - // get all vectors - public TermFreqVector[] getTermFreqVectors(int n) throws IOException { - ensureOpen(); - ArrayList results = new ArrayList(); - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = 
(IndexReader)e.getValue(); - TermFreqVector vector = reader.getTermFreqVector(n, field); - if (vector != null) - results.add(vector); - } - return (TermFreqVector[]) - results.toArray(new TermFreqVector[results.size()]); - } - - public TermFreqVector getTermFreqVector(int n, String field) - throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - return reader==null ? null : reader.getTermFreqVector(n, field); - } - - - public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - if (reader != null) { - reader.getTermFreqVector(docNumber, field, mapper); - } - } - - public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { - ensureOpen(); - ensureOpen(); - - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); - reader.getTermFreqVector(docNumber, field, mapper); - } - - } - - public boolean hasNorms(String field) throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - return reader==null ? false : reader.hasNorms(field); - } - - public byte[] norms(String field) throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - return reader==null ? 
null : reader.norms(field); - } - - public void norms(String field, byte[] result, int offset) - throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - if (reader!=null) - reader.norms(field, result, offset); - } - - protected void doSetNorm(int n, String field, byte value) - throws CorruptIndexException, IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - if (reader!=null) - reader.doSetNorm(n, field, value); - } - - public TermEnum terms() throws IOException { - ensureOpen(); - return new ParallelTermEnum(); - } - - public TermEnum terms(Term term) throws IOException { - ensureOpen(); - return new ParallelTermEnum(term); - } - - public int docFreq(Term term) throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); - return reader==null ? 0 : reader.docFreq(term); - } - - public TermDocs termDocs(Term term) throws IOException { - ensureOpen(); - return new ParallelTermDocs(term); - } - - public TermDocs termDocs() throws IOException { - ensureOpen(); - return new ParallelTermDocs(); - } - - public TermPositions termPositions(Term term) throws IOException { - ensureOpen(); - return new ParallelTermPositions(term); - } - - public TermPositions termPositions() throws IOException { - ensureOpen(); - return new ParallelTermPositions(); - } - - /** - * Checks recursively if all subreaders are up to date. 
- */ - public boolean isCurrent() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isCurrent()) { - return false; - } - } - - // all subreaders are up to date - return true; - } - - /** - * Checks recursively if all subindexes are optimized - */ - public boolean isOptimized() { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isOptimized()) { - return false; - } - } - - // all subindexes are optimized - return true; - } - - - /** Not implemented. - * @throws UnsupportedOperationException - */ - public long getVersion() { - throw new UnsupportedOperationException("ArchiveParallelReader does not support this method."); - } - - // for testing - IndexReader[] getSubReaders() { - return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); - } - - protected void doCommit() throws IOException { - for (int i = 0; i < readers.size(); i++) - ((IndexReader)readers.get(i)).commit(); - } - - protected synchronized void doClose() throws IOException { - for (int i = 0; i < readers.size(); i++) { - if (((Boolean) decrefOnClose.get(i)).booleanValue()) { - ((IndexReader)readers.get(i)).decRef(); - } else { - ((IndexReader)readers.get(i)).close(); - } - } - } - - public Collection getFieldNames (IndexReader.FieldOption fieldNames) { - ensureOpen(); - Set fieldSet = new HashSet(); - for (int i = 0; i < readers.size(); i++) { - IndexReader reader = ((IndexReader)readers.get(i)); - Collection names = reader.getFieldNames(fieldNames); - fieldSet.addAll(names); - } - return fieldSet; - } - - private class ParallelTermEnum extends TermEnum { - private String field; - private Iterator fieldIterator; - private TermEnum termEnum; - - public ParallelTermEnum() throws IOException { - if ( fieldToReader.isEmpty( ) ) return ; - - field = (String)fieldToReader.firstKey(); - if (field != null) - termEnum = ((IndexReader)fieldToReader.get(field)).terms(); - } - - public 
ParallelTermEnum(Term term) throws IOException { - field = term.field(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - if (reader!=null) - termEnum = reader.terms(term); - } - - public boolean next() throws IOException { - if (termEnum==null) - return false; - - // another term in this field? - if (termEnum.next() && termEnum.term().field()==field) - return true; // yes, keep going - - termEnum.close(); // close old termEnum - - // find the next field with terms, if any - if (fieldIterator==null) { - fieldIterator = fieldToReader.tailMap(field).keySet().iterator(); - fieldIterator.next(); // Skip field to get next one - } - while (fieldIterator.hasNext()) { - field = (String) fieldIterator.next(); - termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, "")); - Term term = termEnum.term(); - if (term!=null && term.field()==field) - return true; - else - termEnum.close(); - } - - return false; // no more fields - } - - public Term term() { - if (termEnum==null) - return null; - - return termEnum.term(); - } - - public int docFreq() { - if (termEnum==null) - return 0; - - return termEnum.docFreq(); - } - - public void close() throws IOException { - if (termEnum!=null) - termEnum.close(); - } - - } - - // wrap a TermDocs in order to support seek(Term) - private class ParallelTermDocs implements TermDocs { - protected TermDocs termDocs; - - public ParallelTermDocs() {} - public ParallelTermDocs(Term term) throws IOException { seek(term); } - - public int doc() { return termDocs.doc(); } - public int freq() { return termDocs.freq(); } - - public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); - termDocs = reader!=null ? 
reader.termDocs(term) : null; - } - - public void seek(TermEnum termEnum) throws IOException { - seek(termEnum.term()); - } - - public boolean next() throws IOException { - if (termDocs==null) - return false; - - return termDocs.next(); - } - - public int read(final int[] docs, final int[] freqs) throws IOException { - if (termDocs==null) - return 0; - - return termDocs.read(docs, freqs); - } - - public boolean skipTo(int target) throws IOException { - if (termDocs==null) - return false; - - return termDocs.skipTo(target); - } - - public void close() throws IOException { - if (termDocs!=null) - termDocs.close(); - } - - } - - private class ParallelTermPositions - extends ParallelTermDocs implements TermPositions { - - public ParallelTermPositions() {} - public ParallelTermPositions(Term term) throws IOException { seek(term); } - - public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); - termDocs = reader!=null ? reader.termPositions(term) : null; - } - - public int nextPosition() throws IOException { - // It is an error to call this if there is no next position, e.g. 
if termDocs==null - return ((TermPositions)termDocs).nextPosition(); - } - - public int getPayloadLength() { - return ((TermPositions)termDocs).getPayloadLength(); - } - - public byte[] getPayload(byte[] data, int offset) throws IOException { - return ((TermPositions)termDocs).getPayload(data, offset); - } - - - // TODO: Remove warning after API has been finalized - public boolean isPayloadAvailable() { - return ((TermPositions) termDocs).isPayloadAvailable(); - } - } - -} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-28 21:24:26 UTC (rev 2860) @@ -0,0 +1,616 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * ARCHIVE: This must be in the lucene index package because it needs + * to call protected methods on other IndexReader objects. 
+ */ +package org.apache.lucene.index; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.FieldSelectorResult; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.TermVectorMapper; + +import java.io.IOException; +import java.util.*; + + +/** An IndexReader which reads multiple, parallel indexes. Each index added + * must have the same number of documents, but typically each contains + * different fields. Each document contains the union of the fields of all + * documents with the same document number. When searching, matches for a + * query term are from the first index added that has the field. + * + * <p>This is useful, e.g., with collections that have large fields which + * change rarely and small fields that change more frequently. The smaller + * fields may be re-indexed in a new index and both indexes may be searched + * together. + * + * <p><strong>Warning:</strong> It is up to you to make sure all indexes + * are created and modified the same way. For example, if you add + * documents to one index, you need to add the same documents in the + * same order to the other indexes. <em>Failure to do so will result in + * undefined behavior</em>. + */ +public class ArchiveParallelReader extends IndexReader { + private List readers = new ArrayList(); + private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close + boolean incRefReaders = false; + private SortedMap fieldToReader = new TreeMap(); + + private int maxDoc; + private int numDocs; + private boolean hasDeletions; + + /** Construct a ArchiveParallelReader. 
+ * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p> + */ + public ArchiveParallelReader() throws IOException { this(true); } + + /** Construct a ArchiveParallelReader. + * @param closeSubReaders indicates whether the subreaders should be closed + * when this ArchiveParallelReader is closed + */ + public ArchiveParallelReader(boolean closeSubReaders) throws IOException { + super(); + this.incRefReaders = !closeSubReaders; + } + + /** Add an IndexReader. + * @throws IOException if there is a low-level IO error + */ + public void add(IndexReader reader) throws IOException + { + ensureOpen(); + if (readers.size() == 0) { + this.maxDoc = reader.maxDoc(); + this.numDocs = reader.numDocs(); + this.hasDeletions = reader.hasDeletions(); + } + + if (reader.maxDoc() != maxDoc) // check compatibility + throw new IllegalArgumentException + ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc()); + if (reader.numDocs() != numDocs) + throw new IllegalArgumentException + ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); + + Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); + Iterator i = fields.iterator(); + while (i.hasNext()) { // update fieldToReader map + String field = (String)i.next(); + if (fieldToReader.get(field) == null) + fieldToReader.put(field, reader); + } + + readers.add(reader); + + if (incRefReaders) { + reader.incRef(); + } + decrefOnClose.add(Boolean.valueOf(incRefReaders)); + } + + /** + * Tries to reopen the subreaders. + * <br> + * If one or more subreaders could be re-opened (i. e. subReader.reopen() + * returned a new instance != subReader), then a new ArchiveParallelReader instance + * is returned, otherwise this instance is returned. + * <p> + * A re-opened instance might share one or more subreaders with the old + * instance. Index modification operations result in undefined behavior + * when performed before the old instance is closed. 
+ * (see {@link IndexReader#reopen()}). + * <p> + * If subreaders are shared, then the reference count of those + * readers is increased to ensure that the subreaders remain open + * until the last referring reader is closed. + * + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public IndexReader reopen() throws CorruptIndexException, IOException { + ensureOpen(); + + boolean reopened = false; + List newReaders = new ArrayList(); + List newDecrefOnClose = new ArrayList(); + + boolean success = false; + + try { + + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = (IndexReader) readers.get(i); + IndexReader newReader = oldReader.reopen(); + newReaders.add(newReader); + // if at least one of the subreaders was updated we remember that + // and return a new MultiReader + if (newReader != oldReader) { + reopened = true; + } + } + + if (reopened) { + ArchiveParallelReader pr = new ArchiveParallelReader(); + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = (IndexReader) readers.get(i); + IndexReader newReader = (IndexReader) newReaders.get(i); + if (newReader == oldReader) { + newDecrefOnClose.add(Boolean.TRUE); + newReader.incRef(); + } else { + // this is a new subreader instance, so on close() we don't + // decRef but close it + newDecrefOnClose.add(Boolean.FALSE); + } + pr.add(newReader); + } + pr.decrefOnClose = newDecrefOnClose; + pr.incRefReaders = incRefReaders; + success = true; + return pr; + } else { + success = true; + // No subreader was refreshed + return this; + } + } finally { + if (!success && reopened) { + for (int i = 0; i < newReaders.size(); i++) { + IndexReader r = (IndexReader) newReaders.get(i); + if (r != null) { + try { + if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) { + r.decRef(); + } else { + r.close(); + } + } catch (IOException ignore) { + // keep going - we want to clean up as much as possible + } + } + } + } + } + } + 
+ + public int numDocs() { + // Don't call ensureOpen() here (it could affect performance) + return numDocs; + } + + public int maxDoc() { + // Don't call ensureOpen() here (it could affect performance) + return maxDoc; + } + + public boolean hasDeletions() { + // Don't call ensureOpen() here (it could affect performance) + return hasDeletions; + } + + // check first reader + public boolean isDeleted(int n) { + // Don't call ensureOpen() here (it could affect performance) + if (readers.size() > 0) + return ((IndexReader)readers.get(0)).isDeleted(n); + return false; + } + + // delete in all readers + protected void doDelete(int n) throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + ((IndexReader)readers.get(i)).deleteDocument(n); + } + hasDeletions = true; + } + + /** + * @see org.apache.lucene.index.ParallelReader.doUndeleteAll + */ + protected void doUndeleteAll() throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + ((IndexReader)readers.get(i)).undeleteAll(); + } + hasDeletions = false; + } + + /** + * <p><strong>ARCHIVE</strong> modification</p> + * <p>Return a <code>Document</code> with fields merged from parallel + * indices. The values for a given field will <strong>only</strong> + * come from the first index that has the field. This matches the + * searching behavior where a field is only searched in the first + * index that has the field.</p> + * <p>This differs from the bundled Lucene <code>ParallelReader</code>, + * which adds all values from every index that has the field.</p> + * <p>The <code>fieldSelector<code> parameter is ignored.</p> + * <h3>Implementation Notes</h3> + * <p>Since getting the document from the reader is the expensive + * operation, we only get it once from each reader. 
Once we've + * gotten the document from the reader, we iterate through the + * fields and only copy those fields that are mapped to the reader.</p> + * <p>The first implementation iterated through the field names, + * getting the document from the corresponding reader for each + * field name (10 fields => 10 document gets) which was a big + * performance hit.</p> + * <p>In this implementation, there are only as many document gets as + * there are readers.</p> + * @param n ordinal position of document to return + * @param fieldSelector ignored + * @return the document with field values assembled from parallel indicdes + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public Document document(int n, FieldSelector fieldSelector) + throws CorruptIndexException, IOException + { + ensureOpen(); + Document result = new Document(); + + for ( IndexReader reader : (List<IndexReader>) readers ) + { + Document d = reader.document( n ); + + for ( Fieldable f : ((List<Fieldable>) d.getFields()) ) + { + if ( fieldToReader.get( f.name( ) ) == reader ) + { + result.add( f ); + } + } + } + + return result; + } + + // get all vectors + public TermFreqVector[] getTermFreqVectors(int n) throws IOException { + ensureOpen(); + ArrayList results = new ArrayList(); + Iterator i = fieldToReader.entrySet().iterator(); + while (i.hasNext()) { + Map.Entry e = (Map.Entry)i.next(); + String field = (String)e.getKey(); + IndexReader reader = (IndexReader)e.getValue(); + TermFreqVector vector = reader.getTermFreqVector(n, field); + if (vector != null) + results.add(vector); + } + return (TermFreqVector[]) + results.toArray(new TermFreqVector[results.size()]); + } + + public TermFreqVector getTermFreqVector(int n, String field) + throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader==null ? 
null : reader.getTermFreqVector(n, field); + } + + + public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader != null) { + reader.getTermFreqVector(docNumber, field, mapper); + } + } + + public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { + ensureOpen(); + ensureOpen(); + + Iterator i = fieldToReader.entrySet().iterator(); + while (i.hasNext()) { + Map.Entry e = (Map.Entry)i.next(); + String field = (String)e.getKey(); + IndexReader reader = (IndexReader)e.getValue(); + reader.getTermFreqVector(docNumber, field, mapper); + } + + } + + public boolean hasNorms(String field) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader==null ? false : reader.hasNorms(field); + } + + public byte[] norms(String field) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader==null ? null : reader.norms(field); + } + + public void norms(String field, byte[] result, int offset) + throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + reader.norms(field, result, offset); + } + + protected void doSetNorm(int n, String field, byte value) + throws CorruptIndexException, IOException { + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + reader.doSetNorm(n, field, value); + } + + public TermEnum terms() throws IOException { + ensureOpen(); + return new ParallelTermEnum(); + } + + public TermEnum terms(Term term) throws IOException { + ensureOpen(); + return new ParallelTermEnum(term); + } + + public int docFreq(Term term) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + return reader==null ? 
0 : reader.docFreq(term); + } + + public TermDocs termDocs(Term term) throws IOException { + ensureOpen(); + return new ParallelTermDocs(term); + } + + public TermDocs termDocs() throws IOException { + ensureOpen(); + return new ParallelTermDocs(); + } + + public TermPositions termPositions(Term term) throws IOException { + ensureOpen(); + return new ParallelTermPositions(term); + } + + public TermPositions termPositions() throws IOException { + ensureOpen(); + return new ParallelTermPositions(); + } + + /** + * Checks recursively if all subreaders are up to date. + */ + public boolean isCurrent() throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + if (!((IndexReader)readers.get(i)).isCurrent()) { + return false; + } + } + + // all subreaders are up to date + return true; + } + + /** + * Checks recursively if all subindexes are optimized + */ + public boolean isOptimized() { + for (int i = 0; i < readers.size(); i++) { + if (!((IndexReader)readers.get(i)).isOptimized()) { + return false; + } + } + + // all subindexes are optimized + return true; + } + + + /** Not implemented. 
+ * @throws UnsupportedOperationException + */ + public long getVersion() { + throw new UnsupportedOperationException("ArchiveParallelReader does not support this method."); + } + + // for testing + IndexReader[] getSubReaders() { + return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); + } + + protected void doCommit() throws IOException { + for (int i = 0; i < readers.size(); i++) + ((IndexReader)readers.get(i)).commit(); + } + + protected synchronized void doClose() throws IOException { + for (int i = 0; i < readers.size(); i++) { + if (((Boolean) decrefOnClose.get(i)).booleanValue()) { + ((IndexReader)readers.get(i)).decRef(); + } else { + ((IndexReader)readers.get(i)).close(); + } + } + } + + public Collection getFieldNames (IndexReader.FieldOption fieldNames) { + ensureOpen(); + Set fieldSet = new HashSet(); + for (int i = 0; i < readers.size(); i++) { + IndexReader reader = ((IndexReader)readers.get(i)); + Collection names = reader.getFieldNames(fieldNames); + fieldSet.addAll(names); + } + return fieldSet; + } + + private class ParallelTermEnum extends TermEnum { + private String field; + private Iterator fieldIterator; + private TermEnum termEnum; + + public ParallelTermEnum() throws IOException { + if ( fieldToReader.isEmpty( ) ) return ; + + field = (String)fieldToReader.firstKey(); + if (field != null) + termEnum = ((IndexReader)fieldToReader.get(field)).terms(); + } + + public ParallelTermEnum(Term term) throws IOException { + field = term.field(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + termEnum = reader.terms(term); + } + + public boolean next() throws IOException { + if (termEnum==null) + return false; + + // another term in this field? 
+ if (termEnum.next() && termEnum.term().field()==field) + return true; // yes, keep going + + termEnum.close(); // close old termEnum + + // find the next field with terms, if any + if (fieldIterator==null) { + fieldIterator = fieldToReader.tailMap(field).keySet().iterator(); + fieldIterator.next(); // Skip field to get next one + } + while (fieldIterator.hasNext()) { + field = (String) fieldIterator.next(); + termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, "")); + Term term = termEnum.term(); + if (term!=null && term.field()==field) + return true; + else + termEnum.close(); + } + + return false; // no more fields + } + + public Term term() { + if (termEnum==null) + return null; + + return termEnum.term(); + } + + public int docFreq() { + if (termEnum==null) + return 0; + + return termEnum.docFreq(); + } + + public void close() throws IOException { + if (termEnum!=null) + termEnum.close(); + } + + } + + // wrap a TermDocs in order to support seek(Term) + private class ParallelTermDocs implements TermDocs { + protected TermDocs termDocs; + + public ParallelTermDocs() {} + public ParallelTermDocs(Term term) throws IOException { seek(term); } + + public int doc() { return termDocs.doc(); } + public int freq() { return termDocs.freq(); } + + public void seek(Term term) throws IOException { + IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + termDocs = reader!=null ? 
reader.termDocs(term) : null; + } + + public void seek(TermEnum termEnum) throws IOException { + seek(termEnum.term()); + } + + public boolean next() throws IOException { + if (termDocs==null) + return false; + + return termDocs.next(); + } + + public int read(final int[] docs, final int[] freqs) throws IOException { + if (termDocs==null) + return 0; + + return termDocs.read(docs, freqs); + } + + public boolean skipTo(int target) throws IOException { + if (termDocs==null) + return false; + + return termDocs.skipTo(target); + } + + public void close() throws IOException { + if (termDocs!=null) + termDocs.close(); + } + + } + + private class ParallelTermPositions + extends ParallelTermDocs implements TermPositions { + + public ParallelTermPositions() {} + public ParallelTermPositions(Term term) throws IOException { seek(term); } + + public void seek(Term term) throws IOException { + IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + termDocs = reader!=null ? reader.termPositions(term) : null; + } + + public int nextPosition() throws IOException { + // It is an error to call this if there is no next position, e.g. if termDocs==null + return ((TermPositions)termDocs).nextPosition(); + } + + public int getPayloadLength() { + return ((TermPositions)termDocs).getPayloadLength(); + } + + public byte[] getPayload(byte[] data, int offset) throws IOException { + return ((TermPositions)termDocs).getPayload(data, offset); + } + + + // TODO: Remove warning after API has been finalized + public boolean isPayloadAvailable() { + return ((TermPositions) termDocs).isPayloadAvailable(); + } + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-28 03:40:28
|
Revision: 2859 http://archive-access.svn.sourceforge.net/archive-access/?rev=2859&view=rev Author: bradtofel Date: 2009-10-28 03:40:14 +0000 (Wed, 28 Oct 2009) Log Message: ----------- FEATURE: added exactHostMatch boolean member, which causes only results matching the exact host request to be returned. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2009-10-28 03:38:26 UTC (rev 2858) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2009-10-28 03:40:14 UTC (rev 2859) @@ -25,6 +25,7 @@ package org.archive.wayback.webapp; import java.io.IOException; +import java.util.List; import java.util.Locale; import java.util.Properties; import java.util.logging.Logger; @@ -82,6 +83,7 @@ private boolean useServerName = false; private boolean useAnchorWindow = false; private boolean exactSchemeMatch = true; + private boolean exactHostMatch = false; private int contextPort = 0; private String contextName = null; @@ -97,7 +99,25 @@ private BooleanOperator<WaybackRequest> authentication = null; private String urlRoot = null; private Locale locale = null; + private List<String> filePatterns = null; + private List<String> filePrefixes = null; + + public List<String> getFilePatterns() { + return filePatterns; + } + public void setFilePatterns(List<String> filePatterns) { + this.filePatterns = filePatterns; + } + + public List<String> getFilePrefixes() { + return filePrefixes; + } + + public void setFilePrefixes(List<String> filePrefixes) { + this.filePrefixes = filePrefixes; + } + /** * @return the contextName */ @@ -346,6 +366,7 @@ } 
else { + wbRequest.setExactHost(exactHostMatch); handleQuery(wbRequest,httpRequest,httpResponse); } } else { @@ -575,4 +596,18 @@ public void setUrlRoot(String urlRoot) { this.urlRoot = urlRoot; } + + /** + * @return the exactHostMatch + */ + public boolean isExactHostMatch() { + return exactHostMatch; + } + + /** + * @param exactHostMatch the exactHostMatch to set + */ + public void setExactHostMatch(boolean exactHostMatch) { + this.exactHostMatch = exactHostMatch; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-28 03:38:40
|
Revision: 2858 http://archive-access.svn.sourceforge.net/archive-access/?rev=2858&view=rev Author: bradtofel Date: 2009-10-28 03:38:26 +0000 (Wed, 28 Oct 2009) Log Message: ----------- INITIAL REV: simple common ops on byte arrays Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ByteOp.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ByteOp.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ByteOp.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ByteOp.java 2009-10-28 03:38:26 UTC (rev 2858) @@ -0,0 +1,44 @@ +/* ByteOp + * + * $Id$ + * + * Created on 3:56:12 PM Dec 16, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of Wayback. + * + * SocksProxyCore is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * SocksProxyCore is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
/**
 * Simple common operations on byte arrays.
 */
public class ByteOp {
    /**
     * Returns a newly allocated array holding {@code length} bytes of
     * {@code src}, starting at index {@code offset}.
     *
     * @param src array to copy from
     * @param offset index in {@code src} of the first byte to copy
     * @param length number of bytes to copy
     * @return a new array of exactly {@code length} bytes
     */
    public static byte[] copy(byte[] src, int offset, int length) {
        byte[] copy = new byte[length];
        System.arraycopy(src, offset, copy, 0, length);
        return copy;
    }

    /**
     * Tests two byte arrays for equality: same length and same byte at every
     * index. Two {@code null} arguments compare equal; {@code null} never
     * equals a non-null array.
     *
     * @param input array to test
     * @param want array to compare against
     * @return true if both arrays hold the same bytes in the same order
     */
    public static boolean cmp(byte[] input, byte[] want) {
        // Fully qualified so this class needs no import of its own.
        return java.util.Arrays.equals(input, want);
    }
}
From: <bra...@us...> - 2009-10-28 01:04:46
|
Revision: 2857 http://archive-access.svn.sourceforge.net/archive-access/?rev=2857&view=rev Author: bradtofel Date: 2009-10-28 01:04:33 +0000 (Wed, 28 Oct 2009) Log Message: ----------- INITIAL REV: couldn't find anything out there that had the right interface: produce HTTP message and headers from a raw InputStream, so this is here until a better and usable package out there is discovered.. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/BadRequestException.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpMessage.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequest.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequestMessage.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponse.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponseMessage.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/BadRequestException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/BadRequestException.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/BadRequestException.java 2009-10-28 01:04:33 UTC (rev 2857) @@ -0,0 +1,34 @@ +/* BadRequestException + * + * $Id$ + * + * Created on 3:56:12 PM Dec 16, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of Wayback. 
/**
 * Signals that an incoming HTTP message could not be parsed. It is an
 * {@link IOException}, so callers that already handle IO failures also
 * handle malformed input.
 */
public class BadRequestException extends IOException {

    private static final long serialVersionUID = -7123306169949959915L;

    /**
     * @param message description of what was wrong with the request
     */
    public BadRequestException(String message) {
        super(message);
    }
}
+ * + * ProxyServletCore is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * ProxyServletCore is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with ProxyServletCore; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.http; + +import java.io.IOException; +import java.io.InputStream; + +import org.archive.wayback.util.http.BadRequestException; +import org.archive.wayback.util.ByteOp; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class HttpMessage { + private static int MAX_MESSAGE_SIZE = 4096; + public static byte SPACE = 32; + public static byte CR = 13; + public static byte LF = 10; + + public static byte[] readLine(InputStream in, int max) + throws IOException, BadRequestException { + + byte[] buffer = new byte[max]; + int pos = 0; + boolean found = false; + while(pos < max) { + int next = in.read(); + buffer[pos] = (byte) next; + if(next == LF) { + if(pos == 0) { + throw new BadRequestException( + "Message cannot start with LF"); + } + if(buffer[pos - 1] == CR) { + found = true; + break; + } + } + pos++; + } + if(!found) { + throw new BadRequestException("Message too long without CRLF"); + } + return ByteOp.copy(buffer,0,pos+1); + } + + private static int[] findSpaces(byte[] buffer, int max) + throws BadRequestException { + + int spaces[] = new int[max]; + int found = 0; + int offset = 0; + int end = buffer.length - 2; + while(offset < end) { + if(buffer[offset] == SPACE) { + spaces[found] = 
offset; + found++; + } + if(found == max - 1) { + break; + } + offset++; + } + if(found != max - 1) { + throw new BadRequestException("Not enough fields(" + found +") " + + "want("+max+") in (" + new String(buffer)+ ")"); + } + return spaces; + } + + public static byte[][] loadFields(byte[] buffer, int max) + throws BadRequestException { + + byte[][] fields = new byte[max][]; + int[] offsets = findSpaces(buffer, max); + int start = 0; + for(int i = 0; i < max - 1; i++) { + fields[i] = ByteOp.copy(buffer, start, offsets[i] - start); + start = offsets[i] + 1; + } + fields[max-1] = ByteOp.copy(buffer, start, (buffer.length - 2) - start); + return fields; + } + + public byte[] concatBytes(byte[][] fields, boolean addCrLf) { + int length = 1; + for(byte[] field : fields) { + length += field.length + 1; + } + if(!addCrLf) { + length -= 2; + } + byte[] buffer = new byte[length]; + int index = 0; + for(byte[] field : fields) { + System.arraycopy(field, 0, + buffer, index, field.length); + index += field.length; + if(index < length) { + buffer[index] = SPACE; + } + index++; + } + if(addCrLf) { + buffer[length - 2] = CR; + buffer[length - 1] = LF; + } + + return buffer; + } + + public static HttpResponseMessage loadResponse(byte[] buffer) + throws BadRequestException { + + byte[][] fields = loadFields(buffer,3); + + return new HttpResponseMessage(fields[0],fields[1],fields[2]); + } + + public static HttpResponseMessage loadResponse(InputStream in) + throws BadRequestException, IOException { + + return loadResponse(readLine(in, MAX_MESSAGE_SIZE)); + } + + public static HttpRequestMessage loadRequest(byte[] buffer) + throws BadRequestException { + + byte[][] fields = loadFields(buffer,3); + + return new HttpRequestMessage(fields[0],fields[1],fields[2]); + } + + public static HttpRequestMessage loadRequest(InputStream in) + throws BadRequestException, IOException { + + return loadRequest(readLine(in, MAX_MESSAGE_SIZE)); + } +} Added: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequest.java 2009-10-28 01:04:33 UTC (rev 2857) @@ -0,0 +1,111 @@ +/* HttpRequest + * + * $Id$ + * + * Created on 4:49:10 PM Dec 16, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of Wayback. + * + * SocksProxyCore is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * SocksProxyCore is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with SocksProxyCore; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.http; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; + +import org.archive.util.anvl.ANVLRecord; +import org.archive.wayback.util.http.BadRequestException; +import org.archive.wayback.util.ByteOp; +
/**
 * An HTTP request: the parsed request line plus the header bytes that
 * followed it, kept both raw and parsed into an {@link ANVLRecord}.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class HttpRequest {

    /** Size of the header read buffer; a read that fills it is rejected. */
    private static final int MAX_HEADER_SIZE = 10240;

    private HttpRequestMessage message = null;
    private ANVLRecord headers = null;

    private byte[] originalHeaders = null;

    /**
     * @param message the parsed request line
     * @param originalHeaders raw header bytes following the request line;
     *        the array is stored as-is, not copied
     * @throws IOException if {@link ANVLRecord#load} fails on the bytes
     */
    public HttpRequest(HttpRequestMessage message, byte[] originalHeaders)
        throws IOException {

        this.originalHeaders = originalHeaders;
        this.message = message;
        // The raw bytes are kept above; additionally parse them so that
        // individual headers can be looked up via getHeaders().
        ByteArrayInputStream bais = new ByteArrayInputStream(originalHeaders);
        headers = ANVLRecord.load(bais);
    }

    /**
     * @return the parsed headers
     */
    public ANVLRecord getHeaders() {
        return headers;
    }

    /**
     * @param headers the headers to set; the raw bytes returned by
     *        {@link #getOriginalHeaders()} are not updated
     */
    public void setHeaders(ANVLRecord headers) {
        this.headers = headers;
    }

    /**
     * @return the raw header bytes this request was constructed with
     */
    public byte[] getOriginalHeaders() {
        return originalHeaders;
    }

    /**
     * @return the parsed request line
     */
    public HttpRequestMessage getMessage() {
        return message;
    }

    /**
     * @return the method field of the request line
     */
    public String getMethod() {
        return message.getMethod();
    }

    /**
     * @return the path field of the request line
     */
    public String getPath() {
        return message.getPath();
    }

    /**
     * Reads a request line and then the header bytes from {@code in}.
     *
     * @param in stream positioned at the start of the request line
     * @return the loaded request
     * @throws IOException if the underlying stream or header parsing fails
     * @throws BadRequestException if the request line is malformed, if the
     *         stream ends right after the request line, or if the header
     *         read fills the whole {@code MAX_HEADER_SIZE} buffer
     */
    public static HttpRequest load(InputStream in)
        throws IOException, BadRequestException {

        HttpRequestMessage message = HttpMessage.loadRequest(in);

        byte[] buffer = new byte[MAX_HEADER_SIZE];

        // NOTE(review): a single read() may return before all header bytes
        // have arrived; only what this one call delivers is used.
        int r = in.read(buffer, 0, MAX_HEADER_SIZE);
        if(r == -1) {
            // End of stream: -1 must not reach ByteOp.copy as a length.
            throw new BadRequestException(
                    "Unexpected end of stream reading headers");
        }
        if(r == MAX_HEADER_SIZE) {
            throw new BadRequestException("Request too long");
        }
        return new HttpRequest(message, ByteOp.copy(buffer,0,r));
    }
}
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequestMessage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequestMessage.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequestMessage.java 2009-10-28 01:04:33 UTC (rev 2857) @@ -0,0 +1,151 @@ +/* HttpRequestMessage + * + * $Id$ + * + * Created on 5:44:56 PM Mar 2, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of Wayback. + * + * ProxyServletCore is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * ProxyServletCore is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with ProxyServletCore; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.http; + +import org.archive.wayback.util.ByteOp; + +
/**
 * The three fields of an HTTP request line (method, path, version), held
 * as raw bytes.
 *
 * Parsing lives in {@link HttpMessage#loadRequest(byte[])}; this class only
 * stores the fields and serializes them back via
 * {@link HttpMessage#concatBytes(byte[][], boolean)}.
 */
public class HttpRequestMessage extends HttpMessage {
    private static final byte[] METHOD_HEAD = {'H', 'E', 'A', 'D'};
    private static final byte[] METHOD_GET = {'G', 'E', 'T'};

    private byte[] method = null;
    private byte[] path = null;
    private byte[] version = null;

    /**
     * @param method bytes of the method field
     * @param path bytes of the path field
     * @param version bytes of the version field
     */
    public HttpRequestMessage(byte[] method, byte[] path, byte[] version) {
        this.method = method;
        this.path = path;
        this.version = version;
    }

    /**
     * @return the method, decoded with the platform default charset
     */
    public String getMethod() {
        return new String(method);
    }

    /**
     * @return the path, decoded with the platform default charset
     */
    public String getPath() {
        return new String(path);
    }

    /**
     * @return the version, decoded with the platform default charset
     */
    public String getVersion() {
        return new String(version);
    }

    /**
     * @return a new message referring to the same three byte arrays; the
     *         arrays themselves are not copied
     */
    public HttpRequestMessage _clone() {
        return new HttpRequestMessage(method,path,version);
    }

    /**
     * @param path replacement bytes for the path field
     */
    public void setPath(byte[] path) {
        this.path = path;
    }

    /**
     * Serializes the request line as "method SPACE path SPACE version".
     *
     * @param addCrLf whether to append CRLF
     * @return the serialized bytes
     */
    public byte[] getBytes(boolean addCrLf) {
        byte[][] fields = {method,path,version};
        return concatBytes(fields,addCrLf);
    }

    /**
     * @return result of comparing the method bytes with "HEAD" via
     *         {@code ByteOp.cmp} (presumably byte-wise equality -- confirm)
     */
    public boolean isHead() {
        return ByteOp.cmp(method,METHOD_HEAD);
    }

    /**
     * @return result of comparing the method bytes with "GET" via
     *         {@code ByteOp.cmp} (presumably byte-wise equality -- confirm)
     */
    public boolean isGet() {
        return ByteOp.cmp(method,METHOD_GET);
    }
}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpRequestMessage.java ___________________________________________________________________ Added: svn:executable + * Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponse.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponse.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponse.java 2009-10-28 01:04:33 UTC (rev 2857) @@ -0,0 +1,60 @@ +/* HttpResponse + * + * $Id$ + * + * Created on 5:44:56 PM Mar 2, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of Wayback. + * + * ProxyServletCore is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * ProxyServletCore is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with ProxyServletCore; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.http; + +import java.io.IOException; +import java.io.InputStream; + +import org.archive.util.anvl.ANVLRecord; +import org.archive.wayback.util.http.BadRequestException; + +
/**
 * An HTTP response: the parsed status line, the parsed headers, and the
 * stream from which the body can still be read.
 */
public class HttpResponse {
    private final HttpResponseMessage message;
    private final ANVLRecord headers;
    private final InputStream bodyInputStream;

    /**
     * @param message the parsed status line
     * @param headers the parsed headers
     * @param bodyInputStream stream handed back by
     *        {@link #getBodyInputStream()}
     */
    public HttpResponse(HttpResponseMessage message, ANVLRecord headers,
            InputStream bodyInputStream) {

        this.bodyInputStream = bodyInputStream;
        this.headers = headers;
        this.message = message;
    }

    /**
     * @return the parsed status line
     */
    public HttpResponseMessage getMessage() {
        return message;
    }

    /**
     * @return the parsed headers
     */
    public ANVLRecord getHeaders() {
        return headers;
    }

    /**
     * @return the stream supplied at construction time
     */
    public InputStream getBodyInputStream() {
        return bodyInputStream;
    }

    /**
     * Reads the status line and then the headers from {@code in}, and
     * wraps them together with {@code in} itself.
     *
     * @param in stream positioned at the start of the status line
     * @return the loaded response, whose body stream is {@code in}
     * @throws BadRequestException if the status line is malformed
     * @throws IOException if reading from {@code in} fails
     */
    public static HttpResponse load(InputStream in)
        throws BadRequestException, IOException {

        // Order matters: the status line precedes the headers on the wire.
        HttpResponseMessage statusLine = HttpMessage.loadResponse(in);
        ANVLRecord parsedHeaders = ANVLRecord.load(in);
        return new HttpResponse(statusLine, parsedHeaders, in);
    }
}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponse.java ___________________________________________________________________ Added: svn:executable + * Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponseMessage.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponseMessage.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/http/HttpResponseMessage.java 2009-10-28 01:04:33 UTC (rev 2857) @@ -0,0 +1,62 @@ +/* HttpResponseMessage + * + * $Id$ + * + * Created on 5:44:56 PM Mar 2, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of Wayback. + * + * ProxyServletCore is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * ProxyServletCore is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with ProxyServletCore; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.http; + +import org.archive.wayback.util.ByteOp; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class HttpResponseMessage extends HttpMessage { + private static byte[] HTTP_304 = {'3', '0', '4'}; + private byte[] version = null; + private byte[] code = null; + private byte[] text = null; + public HttpResponseMessage(byte[] version, byte[] code, byte[] text) { + this.version = version; + this.code = code; + this.text = text; + } + public String getVersion() { + return new String(version); + } + public String getCode() { + return new String(code); + } + public String getText() { + return new String(text); + } + public boolean isNotModified() { + return ByteOp.cmp(code, HTTP_304); + } + public byte[] getBytes(boolean addCrLf) { + byte[][] fields = {version,code,text}; + return concatBytes(fields, addCrLf); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2009-10-28 00:30:32
|
Revision: 2856 http://archive-access.svn.sourceforge.net/archive-access/?rev=2856&view=rev Author: binzino Date: 2009-10-28 00:30:16 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Updated for Nutch 1.0 from Nutch 1.0-dev. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/build.xml Modified: trunk/archive-access/projects/nutchwax/archive/build.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/build.xml 2009-10-28 00:29:23 UTC (rev 2855) +++ trunk/archive-access/projects/nutchwax/archive/build.xml 2009-10-28 00:30:16 UTC (rev 2856) @@ -23,7 +23,7 @@ <property name="lib.dir" value="lib" /> <property name="build.dir" value="${nutch.dir}/build" /> <!-- HACK: Need to import default.properties like Nutch does --> - <property name="final.name" value="nutch-1.0-dev" /> + <property name="final.name" value="nutch-1.0" /> <property name="dist.dir" value="${build.dir}/${final.name}" /> <target name="nutch-compile-core"> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |