Update of /cvsroot/babeldoc/babeldoc/modules/core/src/com/babeldoc/core/pipeline/stage In directory sc8-pr-cvs1:/tmp/cvs-serv17111 Modified Files: DomifyPipelineStage.java XpathSplitterPipelineStage.java Added Files: EntityResolverDiskCache.java EntityResolverMemoryCache.java Log Message: added XML Entity Resolving (local dtd cache), preemptive DOM caching of documents out of XpathSplitter. Changed DOM caching implementation from document attribute to WeakHashMap in DomifyPipelineStage, as the previous exposed a java serialization bug when journaling large XML documents. --- NEW FILE: EntityResolverDiskCache.java --- package com.babeldoc.core.pipeline.stage; import java.util.*; import java.io.*; import org.xml.sax.*; import java.net.URL; // TODO: make a maximum size of all the files on disk // TODO: eliminate extra read from disk on initial population // TODO: figure out cache invalidation // TODO: make a better filename scheme than hashcode // TODO: read the filenames in the directory in the constructor to avoid checking for files every time. /** * EntityResolverDiskCache is a disk cache for resolving XML entities. * If an entity is in the disk cache, it is resolved from there; otherwise * it is resolved from the backend and stored in the disk cache. * The disk cache is just a directory that is full of files. **/ public class EntityResolverDiskCache implements EntityResolver { /** * Construct a new EntityResolverDiskCache that backends to the given EntityResolver and * caches its entities in the given directory. 
**/ public EntityResolverDiskCache(EntityResolver entityResolver, String baseDirectory) throws Exception { this.entityResolver = entityResolver; this.baseDirectory = new File(baseDirectory); if (!this.baseDirectory.isDirectory()) { throw new Exception("Not a directory."); } } /** * resolve an entity **/ public InputSource resolveEntity (String publicId, String systemId) throws SAXException, IOException { try { File file = null; // synchronize on the systemId so two attempts to resolve don't result in two gets from the backend synchronized (systemId.intern()) { file = new File(baseDirectory, Integer.toString(systemId.hashCode())); if (!file.exists()) { InputSource inputSource = entityResolver.resolveEntity(publicId, systemId); if (inputSource != null) { Reader reader = inputSource.getCharacterStream(); if (reader != null) { FileWriter fw = new FileWriter(file); char[] buffer = new char[1024]; int charsRead; while ((charsRead = reader.read(buffer)) != 01) { fw.write(buffer, 0, charsRead); } fw.close(); reader.close(); } else { InputStream byteStream = inputSource.getByteStream(); if (byteStream != null) { FileOutputStream outputStream = new FileOutputStream(file); byte[] buffer = new byte[1024]; int bytesRead; while ((bytesRead = byteStream.read(buffer)) != -1) { outputStream.write(buffer, 0, bytesRead); } outputStream.close(); byteStream.close(); } } } if (!file.exists()) { URL url = new URL(systemId); InputStream inputStream = url.openStream(); FileOutputStream outputStream = new FileOutputStream(file); byte[] buffer = new byte[1024]; int bytesRead; while ((bytesRead = inputStream.read(buffer)) != -1) { outputStream.write(buffer, 0, bytesRead); } outputStream.close(); inputStream.close(); } } } InputSource is = new InputSource(new FileInputStream(file)); is.setPublicId(publicId); is.setSystemId(systemId); return is; } catch (RuntimeException t) { t.printStackTrace(); throw t; } } /** the EntityResolver to backend to **/ EntityResolver entityResolver; /** the 
directory to use as a cache **/ File baseDirectory; } --- NEW FILE: EntityResolverMemoryCache.java ---
package com.babeldoc.core.pipeline.stage;

import java.util.*;
import java.io.*;
import org.xml.sax.*;
import java.net.URL;
import org.xml.sax.helpers.DefaultHandler;

// TODO: make a maximum size for the cache (number of entries or total size in bytes)
// TODO: figure out cache invalidation

/**
 * EntityResolverMemoryCache is a memory cache for resolving XML entities.
 * If an entity is in the memory cache, it is resolved from there; otherwise
 * it is resolved normally and stored in the memory cache for the next time.
 **/
public class EntityResolverMemoryCache implements EntityResolver {
    /** an EntityResolver for babeldoc to use; a memory cache backended to a disk cache **/
    static EntityResolver babeldocEntityResolver = null;
    static {
        try {
            babeldocEntityResolver = new EntityResolverMemoryCache(new EntityResolverDiskCache(new DefaultHandler(), "dtdcache"));
        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    /**
     * Construct a new EntityResolverMemoryCache that backends to the given EntityResolver
     **/
    public EntityResolverMemoryCache(EntityResolver entityResolver) {
        this.entityResolver = entityResolver;
    }

    /**
     * Resolve an entity, serving its bytes from the in-memory cache when present.
     *
     * On a miss the backend resolver is asked first: its character stream (if any),
     * otherwise its byte stream, is read completely. If that yields nothing, the
     * systemId is opened as a URL and read instead. The bytes are then cached under
     * the systemId.
     *
     * @param publicId the public identifier, copied onto the returned InputSource
     * @param systemId the system identifier; used as the cache key and as the fallback URL
     * @return an InputSource over a fresh ByteArrayInputStream of the cached bytes
     * @throws SAXException if the backend resolver throws it
     * @throws IOException if the entity cannot be read
     **/
    public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
        try {
            byte[] bytes = null;
            // synchronize on the systemId so two attempts to resolve don't result in two gets from the backend
            synchronized (systemId.intern()) {
                synchronized (cache) { // synchronize all access on the cache
                    bytes = (byte[]) cache.get(systemId);
                }
                if (bytes == null) { // wasn't cached; resolve it and put it there
                    InputSource inputSource = entityResolver.resolveEntity(publicId, systemId);
                    if (inputSource != null) {
                        // try to read it as a character stream, if not a byte stream
                        Reader reader = inputSource.getCharacterStream();
                        if (reader != null) {
                            // NOTE(review): getBytes() encodes with the platform default charset -
                            // confirm this matches what the parser expects for the entity.
                            bytes = readChars(reader).getBytes();
                        } else {
                            InputStream byteStream = inputSource.getByteStream();
                            if (byteStream != null) {
                                bytes = readBytes(byteStream);
                            }
                        }
                    }
                    if (bytes == null) { // read it as a URL
                        bytes = readBytes(new URL(systemId).openStream());
                    }
                    synchronized (cache) { // store it in the cache
                        cache.put(systemId, bytes);
                    }
                }
            }
            InputSource is = new InputSource(new ByteArrayInputStream(bytes));
            is.setPublicId(publicId);
            is.setSystemId(systemId);
            return is;
        } catch (RuntimeException t) {
            t.printStackTrace();
            throw t;
        }
    }

    /** Read a character stream to its end, always closing it, and return the text. **/
    private static String readChars(Reader reader) throws IOException {
        try {
            CharArrayWriter caw = new CharArrayWriter();
            char[] buffer = new char[1024];
            int charsRead;
            // read() reports end of stream with -1 (the original compared against 01, i.e. 1)
            while ((charsRead = reader.read(buffer)) != -1) {
                caw.write(buffer, 0, charsRead);
            }
            return caw.toString();
        } finally {
            reader.close();
        }
    }

    /** Read a byte stream to its end, always closing it, and return the bytes. **/
    private static byte[] readBytes(InputStream inputStream) throws IOException {
        try {
            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int bytesRead;
            while ((bytesRead = inputStream.read(buffer)) != -1) {
                outputStream.write(buffer, 0, bytesRead);
            }
            return outputStream.toByteArray();
        } finally {
            inputStream.close();
        }
    }

    /** the cache is just a LinkedHashMap **/
    LinkedHashMap cache = new LinkedHashMap();

    /** the EntityResolver to backend to **/
    EntityResolver entityResolver;
}
Index: DomifyPipelineStage.java =================================================================== RCS file: /cvsroot/babeldoc/babeldoc/modules/core/src/com/babeldoc/core/pipeline/stage/DomifyPipelineStage.java,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** DomifyPipelineStage.java 27 Jun 2003 02:19:59 -0000 1.11 --- DomifyPipelineStage.java 24 Jul 2003 14:06:13 -0000 1.12 *************** *** 72,75 **** --- 72,77 ---- import com.babeldoc.core.pipeline.*; import com.babeldoc.core.pipeline.command.PipelineFeeder; + import
com.babeldoc.core.config.ConfigService; + import com.babeldoc.core.config.IConfig; import org.w3c.dom.Document; *************** *** 86,89 **** --- 88,94 ---- import java.util.ArrayList; import java.util.Collection; + import java.util.WeakHashMap; + import java.util.Map; + import java.util.Collections; import javax.xml.parsers.DocumentBuilder; *************** *** 112,118 **** public static final String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; ! /** Constants: attribute name */ ! public static String DOM_KEY = "dom_representation"; ! /** * construct with configuration information object --- 117,162 ---- public static final String JAXP_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; ! /** constants for the configuration stuff */ ! public static final String CONFIG_FILENAME = "domify/config"; ! public static final String USE_DTD_CACHE_NAME = "useDtdCache"; ! public static boolean USE_DTD_CACHE; ! ! /** cache document representations here rather than as document attributes **/ ! private static Map domCache = Collections.synchronizedMap(new WeakHashMap()); ! ! /** keep a DocumentBuilder per Thread **/ ! public static ThreadLocal docBuilder = new ThreadLocal() { ! public Object initialValue() { ! synchronized(DocumentBuilderFactory.class) { ! DocumentBuilder builder = null; ! try { ! DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance(); ! dfactory.setNamespaceAware(true); ! builder = dfactory.newDocumentBuilder(); ! if (USE_DTD_CACHE) { ! builder.setEntityResolver(EntityResolverMemoryCache.babeldocEntityResolver); ! } ! } ! catch (Exception e) { ! com.babeldoc.core.LogService.getInstance().logError("docBuilder.initialValue()", e); ! } ! return builder; ! } ! } ! }; ! ! ! static { ! try { ! /** ! * configuation variables ! */ ! IConfig config = ConfigService.getInstance().getConfig(CONFIG_FILENAME); ! USE_DTD_CACHE = "true".equalsIgnoreCase(config.getString(USE_DTD_CACHE_NAME)); ! 
} catch (Exception e) { ! com.babeldoc.core.LogService.getInstance().logError("Static{}", e); ! } ! } ! /** * construct with configuration information object *************** *** 150,153 **** --- 194,206 ---- // } + public static Document getCachedDom(PipelineDocument document) { + Document doc = (Document)domCache.get(document); + return doc; + } + + public static void putCachedDom(PipelineDocument document, Document doc) { + domCache.put(document, doc); + } + /** * Process the xml document to a document *************** *** 159,163 **** */ public static Document parseToDom(PipelineDocument document, boolean force) { ! Document doc = (Document) document.get(DOM_KEY); try { --- 212,216 ---- */ public static Document parseToDom(PipelineDocument document, boolean force) { ! Document doc = getCachedDom(document); try { *************** *** 175,182 **** } ! DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance(); ! dfactory.setNamespaceAware(true); ! doc = dfactory.newDocumentBuilder().parse(xmlSource); ! document.put(DOM_KEY, doc); } } catch (Exception e) { --- 228,234 ---- } ! DocumentBuilder builder = (DocumentBuilder)docBuilder.get(); ! doc = builder.parse(xmlSource); ! putCachedDom(document, doc); } } catch (Exception e) { *************** *** 244,248 **** public Document parseToDom(PipelineDocument document, boolean force, boolean validate, String schemaFilePath) throws Exception { ! Document doc = (Document) document.get(DOM_KEY); try { --- 296,300 ---- public Document parseToDom(PipelineDocument document, boolean force, boolean validate, String schemaFilePath) throws Exception { ! Document doc = getCachedDom(document); try { *************** *** 287,290 **** --- 339,345 ---- DocumentBuilder builder = dfactory.newDocumentBuilder(); + if (USE_DTD_CACHE) { + builder.setEntityResolver(EntityResolverMemoryCache.babeldocEntityResolver); + } //We have to provide an error handler for parser. It will be used *************** *** 357,361 **** } ! 
document.put(DOM_KEY, doc); setAdditionalInfo(errorHandler.getErrorDescription()); } --- 412,416 ---- } ! putCachedDom(document, doc); setAdditionalInfo(errorHandler.getErrorDescription()); } Index: XpathSplitterPipelineStage.java =================================================================== RCS file: /cvsroot/babeldoc/babeldoc/modules/core/src/com/babeldoc/core/pipeline/stage/XpathSplitterPipelineStage.java,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** XpathSplitterPipelineStage.java 27 Jun 2003 02:19:59 -0000 1.5 --- XpathSplitterPipelineStage.java 24 Jul 2003 14:06:14 -0000 1.6 *************** *** 92,95 **** --- 92,96 ---- import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; + import javax.xml.parsers.DocumentBuilder; *************** *** 152,161 **** TransformerFactory transformFactory = TransformerFactory.newInstance(); ! String[] results = applyXpath(transformFactory, doc, xpath); if ((results == null) || (results.length == 0)) { return super.processHelper(); } else { ! return super.processHelper("text/xml", results); } } catch (Exception x) { --- 153,171 ---- TransformerFactory transformFactory = TransformerFactory.newInstance(); ! Node[] nodes = applyXpath(doc, xpath); ! String[] results = serialize(transformFactory, nodes); if ((results == null) || (results.length == 0)) { return super.processHelper(); } else { ! PipelineStageResult[] psr = super.processHelper("text/xml", results); ! DocumentBuilder documentBuilder = (DocumentBuilder)DomifyPipelineStage.docBuilder.get(); ! for (int i = 0; i < psr.length; i++) { ! Document splitDoc = documentBuilder.newDocument(); ! Node importedNode = splitDoc.importNode(nodes[i], true); ! splitDoc.appendChild(importedNode); ! DomifyPipelineStage.putCachedDom(psr[i].getDocument(), splitDoc); ! } ! return psr; } } catch (Exception x) { *************** *** 182,193 **** } ! /** ! * ! */ ! 
private String[] applyXpath(TransformerFactory transformFactory, ! Document doc, String xpath) throws Exception { boolean xmlOmit = false; ! boolean xmlIndent = false; ! // Load the configuration data if (this.hasOption(OMIT_XML_DECL)) { --- 192,206 ---- } ! private String[] serialize(TransformerFactory transformFactory, Node[] nodes) ! throws Exception ! { ! if (nodes == null) { ! return null; ! } ! String[] results = new String[nodes.length]; ! boolean xmlOmit = false; ! boolean xmlIndent = false; ! // Load the configuration data if (this.hasOption(OMIT_XML_DECL)) { *************** *** 198,205 **** xmlIndent = Boolean.getBoolean(getOptions(XML_INDENT)); } ! // Set up an identity transformer to use as serializer. Transformer serializer = transformFactory.newTransformer(); ! if (xmlOmit) { serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); --- 211,218 ---- xmlIndent = Boolean.getBoolean(getOptions(XML_INDENT)); } ! // Set up an identity transformer to use as serializer. Transformer serializer = transformFactory.newTransformer(); ! if (xmlOmit) { serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); *************** *** 209,213 **** serializer.setOutputProperty(OutputKeys.INDENT, "yes"); } ! NodeIterator nl = XPathAPI.selectNodeIterator(doc, xpath); --- 222,241 ---- serializer.setOutputProperty(OutputKeys.INDENT, "yes"); } ! ! ByteArrayOutputStream baos = new ByteArrayOutputStream(); ! for (int i = 0; i < nodes.length; i++) { ! baos.reset(); ! serializer.transform(new DOMSource(nodes[i]), new StreamResult(baos)); ! results[i] = baos.toString(); ! } ! ! return results; ! } ! ! /** ! * ! */ ! private Node[] applyXpath(Document doc, String xpath) throws Exception { ! NodeIterator nl = XPathAPI.selectNodeIterator(doc, xpath); *************** *** 217,223 **** while ((n = nl.nextNode()) != null) { if (!isTextNode(n)) { ! ByteArrayOutputStream baos = new ByteArrayOutputStream(); ! 
serializer.transform(new DOMSource(n), new StreamResult(baos)); ! vec.addElement(baos.toString()); } } --- 245,249 ---- while ((n = nl.nextNode()) != null) { if (!isTextNode(n)) { ! vec.addElement(n); } } *************** *** 225,229 **** // Return the results vector into an array and return it. if (vec.size() > 0) { ! String[] results = new String[vec.size()]; vec.copyInto(results); --- 251,255 ---- // Return the results vector into an array and return it. if (vec.size() > 0) { ! Node[] results = new Node[vec.size()]; vec.copyInto(results); |