From: John W. <jwe...@us...> - 2010-06-16 19:11:26
|
Update of /cvsroot/dlsciences/dlese-tools-project/src/org/dlese/dpc/index/writer In directory sfp-cvsdas-3.v30.ch3.sourceforge.com:/tmp/cvs-serv28375/src/org/dlese/dpc/index/writer Modified Files: FileIndexingServiceWriter.java IndexingTools.java SimpleXMLFileIndexingWriter.java Log Message: cleaned up methods a bit Index: SimpleXMLFileIndexingWriter.java =================================================================== RCS file: /cvsroot/dlsciences/dlese-tools-project/src/org/dlese/dpc/index/writer/SimpleXMLFileIndexingWriter.java,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** SimpleXMLFileIndexingWriter.java 20 Mar 2009 23:33:53 -0000 1.20 --- SimpleXMLFileIndexingWriter.java 16 Jun 2010 19:11:18 -0000 1.21 *************** *** 14,18 **** * All rights reserved. */ - package org.dlese.dpc.index.writer; --- 14,17 ---- *************** *** 31,38 **** /** ! * Creates a Lucene {@link org.apache.lucene.document.Document} from any valid XML file by stripping the XML ! * tags to extract and index the content. The full content is indexed in the default field and is stemmed and ! * indexed in the stems field. The reader for this type of Document is XMLDocReader. This is the default ! * writer for generic XML formats. * * @author John Weatherley --- 30,38 ---- /** ! * This is the default writer for generic XML formats. Creates a Lucene {@link ! * org.apache.lucene.document.Document} from any valid XML file by stripping the XML tags to extract and ! * index the content. The full content of all Elements and Attributes is indexed in the default and ! * admindefault fields and is stemmed and indexed in the stems field. The reader for this type of Document is ! * XMLDocReader. * * @author John Weatherley Index: FileIndexingServiceWriter.java =================================================================== RCS file: /cvsroot/dlsciences/dlese-tools-project/src/org/dlese/dpc/index/writer/FileIndexingServiceWriter.java,v retrieving revision 1.51 retrieving revision 1.52 diff -C2 -d -r1.51 -r1.52 *** FileIndexingServiceWriter.java 12 Feb 2010 02:13:10 -0000 1.51 --- FileIndexingServiceWriter.java 16 Jun 2010 19:11:18 -0000 1.52 *************** *** 14,18 **** * All rights reserved. */ - package org.dlese.dpc.index.writer; --- 14,17 ---- *************** *** 73,79 **** private static boolean debug = false; private boolean validateFiles = false; - private StringBuffer defaultBuffer = null; - private StringBuffer adminDefaultBuffer = null; - private String adminDefaultFieldName = "admindefault"; private boolean abortIndexing = false; private FileIndexingService fileIndexingService = null; --- 72,75 ---- *************** *** 81,85 **** private File dir = null; private String fileContent = null; ! private Document existingDoc = null; private FileIndexingServiceData newDocData = null; private FileIndexingPlugin fileIndexingPlugin = null; --- 77,82 ---- private File dir = null; private String fileContent = null; ! private Document luceneDoc = null; ! private Document previousRecordDoc = null; private FileIndexingServiceData newDocData = null; private FileIndexingPlugin fileIndexingPlugin = null; *************** *** 93,99 **** /** * Gets a unique document type key for this kind of record, corresponding to the format type. In the DLESE ! * metadata repository, this corresponds to the XML format, for example "oai_dc," "adn," "dlese_ims," or "dlese_anno". ! * The string is parsed using the Lucene {@link org.apache.lucene.analysis.standard.StandardAnalyzer} so it ! * must be lowercase and should not contain any stop words. * * @return The docType String --- 90,96 ---- /** * Gets a unique document type key for this kind of record, corresponding to the format type. In the DLESE ! * metadata repository, this corresponds to the XML format, for example "oai_dc," "adn," "dlese_ims," or ! * "dlese_anno". The string is parsed using the Lucene {@link org.apache.lucene.analysis.standard.StandardAnalyzer} ! * so it must be lowercase and should not contain any stop words. * * @return The docType String *************** *** 130,138 **** * * @param source The source file being indexed ! * @param existingDoc An existing Document that currently resides in the index for the given resource, or * null if none was previously present * @exception Exception If an error occured during set-up. */ ! public abstract void init(File source, Document existingDoc) throws Exception; --- 127,135 ---- * * @param source The source file being indexed ! * @param previousRecordDoc An existing Document that currently resides in the index for the given resource, or * null if none was previously present * @exception Exception If an error occured during set-up. */ ! public abstract void init(File source, Document previousRecordDoc) throws Exception; *************** *** 156,160 **** * * Example code:<br> ! * <code>protected void addCustomFields(Document newDoc, Document existingDoc) throws Exception {</code> * <br> * <code> String customContent = "Some content";</code><br> --- 153,157 ---- * * Example code:<br> ! * <code>protected void addCustomFields(Document newDoc, Document previousRecordDoc) throws Exception {</code> * <br> * <code> String customContent = "Some content";</code><br> *************** *** 164,168 **** * @param newDoc The new {@link org.apache.lucene.document.Document} that is being created for this * resource ! * @param existingDoc An existing {@link org.apache.lucene.document.Document} that currently resides in * the index for the given resource, or null if none was previously present * @param sourceFile The sourceFile that is being indexed --- 161,165 ---- * @param newDoc The new {@link org.apache.lucene.document.Document} that is being created for this * resource ! * @param previousRecordDoc An existing {@link org.apache.lucene.document.Document} that currently resides in * the index for the given resource, or null if none was previously present * @param sourceFile The sourceFile that is being indexed *************** *** 170,174 **** * occurs. */ ! protected abstract void addCustomFields(Document newDoc, Document existingDoc, File sourceFile) throws Exception; --- 167,171 ---- * occurs. */ ! protected abstract void addCustomFields(Document newDoc, Document previousRecordDoc, File sourceFile) throws Exception; *************** *** 185,199 **** if (fileContent == null) { if (isMakingDeletedDoc()) ! fileContent = existingDoc.get("filecontent"); else if (source.exists()) fileContent = Files.readFileToEncoding(source, "UTF-8").toString(); ! if (fileContent == null && existingDoc != null) ! fileContent = existingDoc.get("filecontent"); if (fileContent == null) fileContent = ""; ! // Remove the BOM character, if present, since it crashes the XML processors (Windows notepad places it there when saving files as UTF-8): ! if(fileContent.startsWith("\uFEFF")) ! fileContent = fileContent.replaceFirst("\uFEFF",""); } return fileContent; --- 182,196 ---- if (fileContent == null) { if (isMakingDeletedDoc()) ! fileContent = previousRecordDoc.get("filecontent"); else if (source.exists()) fileContent = Files.readFileToEncoding(source, "UTF-8").toString(); ! if (fileContent == null && previousRecordDoc != null) ! fileContent = previousRecordDoc.get("filecontent"); if (fileContent == null) fileContent = ""; ! // Remove the BOM character, if present, since it crashes the XML processors (Windows notepad places it there when saving files as UTF-8): ! if (fileContent.startsWith("\uFEFF")) ! fileContent = fileContent.replaceFirst("\uFEFF", ""); } return fileContent; *************** *** 265,275 **** return dir; } ! ! /** ! * @return The existingDoc value */ ! public Document getExistingDoc() { ! return existingDoc; } --- 262,285 ---- return dir; } ! /** ! * Gets the Lucene Document that this Writer is building. ! * ! * @return The Lucene Document */ ! public Document getLuceneDoc() { ! if(luceneDoc == null) ! luceneDoc = new Document(); ! return luceneDoc; ! } ! ! /** ! * Gets the previous Document that currently resides in the index for the given resource, or null if none was ! * previously present. ! * ! * @return The previousRecordDoc value ! */ ! public Document getPreviousRecordDoc() { ! return previousRecordDoc; } *************** *** 335,339 **** } - /** * Adds the given String to the 'default' and 'stems' fields as text and stemmed text, respectively. The --- 345,348 ---- *************** *** 345,352 **** protected void addToDefaultField(String value) { if (value != null && value.trim().length() > 0) ! defaultBuffer.append(value).append(IndexingTools.PHRASE_SEPARATOR); } - /** * Adds the given String to a text field referenced in the index by the field name 'admindefault'. The --- 354,360 ---- protected void addToDefaultField(String value) { if (value != null && value.trim().length() > 0) ! IndexingTools.addToDefaultAndStemsFields(getLuceneDoc(), value); } /** * Adds the given String to a text field referenced in the index by the field name 'admindefault'. The *************** *** 358,365 **** protected void addToAdminDefaultField(String value) { if (value != null && value.trim().length() > 0) ! adminDefaultBuffer.append(value).append(IndexingTools.PHRASE_SEPARATOR); } - /** * Creates a Lucene {@link org.apache.lucene.document.Document} equal to the exsiting FileIndexingService --- 366,372 ---- protected void addToAdminDefaultField(String value) { if (value != null && value.trim().length() > 0) ! IndexingTools.addToAdminDefaultField(getLuceneDoc(), value); } /** * Creates a Lucene {@link org.apache.lucene.document.Document} equal to the exsiting FileIndexingService *************** *** 371,387 **** * #getIsMakingDeletedDoc} to execute as appropriate. * ! * @param existingDoc An existing FileIndexingService Document that currently resides in the index for * the given file * @return A Lucene FileIndexingService Document with appropriate fields updated * @exception Throwable Thrown if error occurs */ ! public synchronized Document getDeletedDoc(Document existingDoc) throws Throwable { ! if (existingDoc == null) throw new Exception("getDeletedDoc(): the existing doc is null"); isMakingDeletedDoc = true; ! return existingDoc; } --- 378,394 ---- * #getIsMakingDeletedDoc} to execute as appropriate. * ! * @param previousRecordDoc An existing FileIndexingService Document that currently resides in the index for * the given file * @return A Lucene FileIndexingService Document with appropriate fields updated * @exception Throwable Thrown if error occurs */ ! public synchronized Document getDeletedDoc(Document previousRecordDoc) throws Throwable { ! if (previousRecordDoc == null) throw new Exception("getDeletedDoc(): the existing doc is null"); isMakingDeletedDoc = true; ! return previousRecordDoc; } *************** *** 470,474 **** if (sourceFile != null) this.dir = source.getParentFile(); ! this.existingDoc = existingLuceneDoc; /* --- 477,481 ---- if (sourceFile != null) this.dir = source.getParentFile(); ! this.previousRecordDoc = existingLuceneDoc; /* *************** *** 478,487 **** * correct DocReader */ ! defaultBuffer = new StringBuffer(); ! adminDefaultBuffer = new StringBuffer(); if (newDocData == null) newDocData = new FileIndexingServiceData(); newDocData.clearAll(); ! init(source, existingDoc); if (abortIndexing) { destroy(); --- 485,493 ---- * correct DocReader */ ! if (newDocData == null) newDocData = new FileIndexingServiceData(); newDocData.clearAll(); ! init(source, previousRecordDoc); if (abortIndexing) { destroy(); *************** *** 489,493 **** } ! Document doc = new Document(); // A field/term that matches all records: --- 495,499 ---- } ! Document doc = getLuceneDoc(); // A field/term that matches all records: *************** *** 537,543 **** doc.add(new Field("readerclass", getReaderClass(), Field.Store.YES, Field.Index.TOKENIZED)); ! // ------- Additional fields that are unique to the framework being indexed ------- ! addCustomFields(doc, existingDoc, source); // ------- Add file validation information, if apporpriate ------- --- 543,549 ---- doc.add(new Field("readerclass", getReaderClass(), Field.Store.YES, Field.Index.TOKENIZED)); ! // ------- Index all standard and custom fields for the framework being indexed by subclasses ------- ! addCustomFields(doc, previousRecordDoc, source); // ------- Add file validation information, if apporpriate ------- *************** *** 560,567 **** // Index the 'default' and 'stems' fields. See class JavaDoc for details on this field. ! IndexingTools.indexDefaultAndStemsFields(doc, defaultBuffer.toString()); // Admin default field ! doc.add(new Field(adminDefaultFieldName, adminDefaultBuffer.toString(), Field.Store.NO, Field.Index.TOKENIZED)); if (abortIndexing) { --- 566,573 ---- // Index the 'default' and 'stems' fields. See class JavaDoc for details on this field. ! //IndexingTools.indexDefaultAndStemsFields(doc, defaultBuffer.toString()); // Admin default field ! //doc.add(new Field(adminDefaultFieldName, adminDefaultBuffer.toString(), Field.Store.NO, Field.Index.TOKENIZED)); if (abortIndexing) { Index: IndexingTools.java =================================================================== RCS file: /cvsroot/dlsciences/dlese-tools-project/src/org/dlese/dpc/index/writer/IndexingTools.java,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** IndexingTools.java 20 Mar 2009 23:33:53 -0000 1.10 --- IndexingTools.java 16 Jun 2010 19:11:18 -0000 1.11 *************** *** 14,18 **** * All rights reserved. */ - package org.dlese.dpc.index.writer; --- 14,17 ---- *************** *** 36,42 **** public class IndexingTools { ! private static final String defaultFieldName = "default"; ! private static final String stemsFieldName = "stems"; ! /** * String used to separate and preserve phrases indexed as text, includes leading and trailing white space. --- 35,45 ---- public class IndexingTools { ! /** Default field 'default' */ ! public final static String defaultFieldName = "default"; ! /** Stems field 'stems' */ ! public final static String stemsFieldName = "stems"; ! /** Admin default field 'admindefault' */ ! public final static String adminDefaultFieldName = "admindefault"; ! /** * String used to separate and preserve phrases indexed as text, includes leading and trailing white space. *************** *** 45,56 **** ! public final static void indexDefaultAndStemsFields(Document myDoc, String content) { // See class JavaDoc for details on this field. ! myDoc.add(new Field(defaultFieldName, content,Field.Store.NO, Field.Index.TOKENIZED)); // Note that the Analyzer will handle converting the tokens in this field to their stem form. ! myDoc.add(new Field(stemsFieldName, content,Field.Store.NO, Field.Index.TOKENIZED)); } ! ! /** * Creates a String separated by the phrase separator term from the text of each of the Element or --- 48,76 ---- ! /** ! * Indexes the given text into the default and stems fields. ! * ! * @param myDoc Document to add to ! * @param content Content to add ! */ ! public final static void addToDefaultAndStemsFields(Document myDoc, String content) { // See class JavaDoc for details on this field. ! myDoc.add(new Field(defaultFieldName, content, Field.Store.NO, Field.Index.TOKENIZED)); // Note that the Analyzer will handle converting the tokens in this field to their stem form. ! myDoc.add(new Field(stemsFieldName, content, Field.Store.NO, Field.Index.TOKENIZED)); } ! ! ! /** ! * Indexes the given text into the admin default field. ! * ! * @param myDoc Document to add to ! * @param content Content to add ! */ ! public final static void addToAdminDefaultField(Document myDoc, String content) { ! myDoc.add(new Field(adminDefaultFieldName, content, Field.Store.NO, Field.Index.TOKENIZED)); ! } ! ! /** * Creates a String separated by the phrase separator term from the text of each of the Element or *************** *** 267,279 **** * to the field, the values are Lists of the terms in that field. * ! * @param textToParse The text to analyze with the analyzer ! * @param analyzer The analyzer to use ! * @param field DESCRIPTION ! * @return The terms HashMap generated by the analyzer */ /* public final static HashMap getAnalyzedTermsHashMap(String textToParse, String defaultField, Analyzer analyzer) { Reader reader = new StringReader(textToParse); TokenStream in = analyzer.tokenStream((defaultField == null ? "default" : defaultField), reader); - HashMap map = new HashMap(); ArrayList termList = new ArrayList(); --- 287,298 ---- * to the field, the values are Lists of the terms in that field. * ! * @param textToParse The text to analyze with the analyzer ! * @param analyzer The analyzer to use ! * @param field DESCRIPTION ! * @return The terms HashMap generated by the analyzer */ /* public final static HashMap getAnalyzedTermsHashMap(String textToParse, String defaultField, Analyzer analyzer) { Reader reader = new StringReader(textToParse); TokenStream in = analyzer.tokenStream((defaultField == null ? "default" : defaultField), reader); HashMap map = new HashMap(); ArrayList termList = new ArrayList(); *************** *** 291,295 **** return (String[]) termList.toArray(new String[]{}); } */ - /** * Creates a StringBuffer to display the tokens created by a given analyzer. Output is of the form: [token1] --- 310,313 ---- |