[Dlsciences-dlese-tools] dlese-tools-project/src/org/dlese/dpc/index/writer FileIndexingServiceWriter.java, 1.51, 1.52, IndexingTools.java, 1.10, 1.11, SimpleXMLFileIndexingWriter.java, 1.20, 1.21

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/dlsciences/dlese-tools-project/src/org/dlese/dpc/index/writer
In directory sfp-cvsdas-3.v30.ch3.sourceforge.com:/tmp/cvs-serv28375/src/org/dlese/dpc/index/writer

Modified Files:
	FileIndexingServiceWriter.java IndexingTools.java 
	SimpleXMLFileIndexingWriter.java 
Log Message:
cleaned up methods a bit

Index: SimpleXMLFileIndexingWriter.java
===================================================================
RCS file: /cvsroot/dlsciences/dlese-tools-project/src/org/dlese/dpc/index/writer/SimpleXMLFileIndexingWriter.java,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -d -r1.20 -r1.21
*** SimpleXMLFileIndexingWriter.java	20 Mar 2009 23:33:53 -0000	1.20
--- SimpleXMLFileIndexingWriter.java	16 Jun 2010 19:11:18 -0000	1.21
***************
*** 14,18 ****
   *  All rights reserved.
   */
- 
  package org.dlese.dpc.index.writer;

--- 14,17 ----
***************
*** 31,38 ****

  /**
!  *  Creates a Lucene {@link org.apache.lucene.document.Document} from any valid XML file by stripping the XML
!  *  tags to extract and index the content. The full content is indexed in the default field and is stemmed and
!  *  indexed in the stems field. The reader for this type of Document is XMLDocReader. This is the default
!  *  writer for generic XML formats.
   *
   * @author    John Weatherley
--- 30,38 ----

  /**
!  *  This is the default writer for generic XML formats. Creates a Lucene {@link
!  *  org.apache.lucene.document.Document} from any valid XML file by stripping the XML tags to extract and
!  *  index the content. The full content of all Elements and Attributes is indexed in the default and
!  *  admindefault fields and is stemmed and indexed in the stems field. The reader for this type of Document is
!  *  XMLDocReader.
   *
   * @author    John Weatherley

Index: FileIndexingServiceWriter.java
===================================================================
RCS file: /cvsroot/dlsciences/dlese-tools-project/src/org/dlese/dpc/index/writer/FileIndexingServiceWriter.java,v
retrieving revision 1.51
retrieving revision 1.52
diff -C2 -d -r1.51 -r1.52
*** FileIndexingServiceWriter.java	12 Feb 2010 02:13:10 -0000	1.51
--- FileIndexingServiceWriter.java	16 Jun 2010 19:11:18 -0000	1.52
***************
*** 14,18 ****
   *  All rights reserved.
   */
- 
  package org.dlese.dpc.index.writer;

--- 14,17 ----
***************
*** 73,79 ****
  	private static boolean debug = false;
  	private boolean validateFiles = false;
- 	private StringBuffer defaultBuffer = null;
- 	private StringBuffer adminDefaultBuffer = null;
- 	private String adminDefaultFieldName = "admindefault";
  	private boolean abortIndexing = false;
  	private FileIndexingService fileIndexingService = null;
--- 72,75 ----
***************
*** 81,85 ****
  	private File dir = null;
  	private String fileContent = null;
! 	private Document existingDoc = null;
  	private FileIndexingServiceData newDocData = null;
  	private FileIndexingPlugin fileIndexingPlugin = null;
--- 77,82 ----
  	private File dir = null;
  	private String fileContent = null;
! 	private Document luceneDoc = null;
! 	private Document previousRecordDoc = null;
  	private FileIndexingServiceData newDocData = null;
  	private FileIndexingPlugin fileIndexingPlugin = null;
***************
*** 93,99 ****
  	/**
  	 *  Gets a unique document type key for this kind of record, corresponding to the format type. In the DLESE
! 	 *  metadata repository, this corresponds to the XML format, for example "oai_dc," "adn," "dlese_ims," or "dlese_anno".
! 	 *  The string is parsed using the Lucene {@link org.apache.lucene.analysis.standard.StandardAnalyzer} so it
! 	 *  must be lowercase and should not contain any stop words.
  	 *
  	 * @return                The docType String
--- 90,96 ----
  	/**
  	 *  Gets a unique document type key for this kind of record, corresponding to the format type. In the DLESE
! 	 *  metadata repository, this corresponds to the XML format, for example "oai_dc," "adn," "dlese_ims," or
! 	 *  "dlese_anno". The string is parsed using the Lucene {@link org.apache.lucene.analysis.standard.StandardAnalyzer}
! 	 *  so it must be lowercase and should not contain any stop words.
  	 *
  	 * @return                The docType String
***************
*** 130,138 ****
  	 *
  	 * @param  source         The source file being indexed
! 	 * @param  existingDoc    An existing Document that currently resides in the index for the given resource, or
  	 *      null if none was previously present
  	 * @exception  Exception  If an error occured during set-up.
  	 */
! 	public abstract void init(File source, Document existingDoc) throws Exception;

--- 127,135 ----
  	 *
  	 * @param  source         The source file being indexed
! 	 * @param  previousRecordDoc    An existing Document that currently resides in the index for the given resource, or
  	 *      null if none was previously present
  	 * @exception  Exception  If an error occured during set-up.
  	 */
! 	public abstract void init(File source, Document previousRecordDoc) throws Exception;

***************
*** 156,160 ****
  	 *
  	 *  Example code:<br>
! 	 *  <code>protected void addCustomFields(Document newDoc, Document existingDoc) throws Exception {</code>
  	 *  <br>
  	 *  &nbsp;<code> String customContent = "Some content";</code><br>
--- 153,157 ----
  	 *
  	 *  Example code:<br>
! 	 *  <code>protected void addCustomFields(Document newDoc, Document previousRecordDoc) throws Exception {</code>
  	 *  <br>
  	 *  &nbsp;<code> String customContent = "Some content";</code><br>
***************
*** 164,168 ****
  	 * @param  newDoc         The new {@link org.apache.lucene.document.Document} that is being created for this
  	 *      resource
! 	 * @param  existingDoc    An existing {@link org.apache.lucene.document.Document} that currently resides in
  	 *      the index for the given resource, or null if none was previously present
  	 * @param  sourceFile     The sourceFile that is being indexed
--- 161,165 ----
  	 * @param  newDoc         The new {@link org.apache.lucene.document.Document} that is being created for this
  	 *      resource
! 	 * @param  previousRecordDoc    An existing {@link org.apache.lucene.document.Document} that currently resides in
  	 *      the index for the given resource, or null if none was previously present
  	 * @param  sourceFile     The sourceFile that is being indexed
***************
*** 170,174 ****
  	 *      occurs.
  	 */
! 	protected abstract void addCustomFields(Document newDoc, Document existingDoc, File sourceFile) throws Exception;

--- 167,171 ----
  	 *      occurs.
  	 */
! 	protected abstract void addCustomFields(Document newDoc, Document previousRecordDoc, File sourceFile) throws Exception;

***************
*** 185,199 ****
  		if (fileContent == null) {
  			if (isMakingDeletedDoc())
! 				fileContent = existingDoc.get("filecontent");
  			else if (source.exists())
  				fileContent = Files.readFileToEncoding(source, "UTF-8").toString();
! 			if (fileContent == null && existingDoc != null)
! 				fileContent = existingDoc.get("filecontent");
  			if (fileContent == null)
  				fileContent = "";
! 			
  			// Remove the BOM character, if present, since it crashes the XML processors (Windows notepad places it there when saving files as UTF-8):
! 			if(fileContent.startsWith("\uFEFF"))
! 				fileContent = fileContent.replaceFirst("\uFEFF","");
  		}
  		return fileContent;
--- 182,196 ----
  		if (fileContent == null) {
  			if (isMakingDeletedDoc())
! 				fileContent = previousRecordDoc.get("filecontent");
  			else if (source.exists())
  				fileContent = Files.readFileToEncoding(source, "UTF-8").toString();
! 			if (fileContent == null && previousRecordDoc != null)
! 				fileContent = previousRecordDoc.get("filecontent");
  			if (fileContent == null)
  				fileContent = "";
! 
  			// Remove the BOM character, if present, since it crashes the XML processors (Windows notepad places it there when saving files as UTF-8):
! 			if (fileContent.startsWith("\uFEFF"))
! 				fileContent = fileContent.replaceFirst("\uFEFF", "");
  		}
  		return fileContent;
***************
*** 265,275 ****
  		return dir;
  	}
! 
! 
  	/**
! 	 * @return    The existingDoc value
  	 */
! 	public Document getExistingDoc() {
! 		return existingDoc;
  	}

--- 262,285 ----
  		return dir;
  	}
! 	
  	/**
! 	 *  Gets the Lucene Document that this Writer is building.
! 	 *
! 	 * @return    The Lucene Document
  	 */
! 	public Document getLuceneDoc() {
! 		if(luceneDoc == null)
! 			luceneDoc = new Document();
! 		return luceneDoc;
! 	}	
! 	
! 	/**
! 	 *  Gets the previous Document that currently resides in the index for the given resource, or null if none was
! 	 *  previously present.
! 	 *
! 	 * @return    The previousRecordDoc value
! 	 */
! 	public Document getPreviousRecordDoc() {
! 		return previousRecordDoc;
  	}

***************
*** 335,339 ****
  	}

- 
  	/**
  	 *  Adds the given String to the 'default' and 'stems' fields as text and stemmed text, respectively. The
--- 345,348 ----
***************
*** 345,352 ****
  	protected void addToDefaultField(String value) {
  		if (value != null && value.trim().length() > 0)
! 			defaultBuffer.append(value).append(IndexingTools.PHRASE_SEPARATOR);
  	}

- 
  	/**
  	 *  Adds the given String to a text field referenced in the index by the field name 'admindefault'. The
--- 354,360 ----
  	protected void addToDefaultField(String value) {
  		if (value != null && value.trim().length() > 0)
! 			IndexingTools.addToDefaultAndStemsFields(getLuceneDoc(), value);
  	}

  	/**
  	 *  Adds the given String to a text field referenced in the index by the field name 'admindefault'. The
***************
*** 358,365 ****
  	protected void addToAdminDefaultField(String value) {
  		if (value != null && value.trim().length() > 0)
! 			adminDefaultBuffer.append(value).append(IndexingTools.PHRASE_SEPARATOR);
  	}

- 
  	/**
  	 *  Creates a Lucene {@link org.apache.lucene.document.Document} equal to the exsiting FileIndexingService
--- 366,372 ----
  	protected void addToAdminDefaultField(String value) {
  		if (value != null && value.trim().length() > 0)
! 			IndexingTools.addToAdminDefaultField(getLuceneDoc(), value);
  	}

  	/**
  	 *  Creates a Lucene {@link org.apache.lucene.document.Document} equal to the exsiting FileIndexingService
***************
*** 371,387 ****
  	 *  #getIsMakingDeletedDoc} to execute as appropriate.
  	 *
! 	 * @param  existingDoc    An existing FileIndexingService Document that currently resides in the index for
  	 *      the given file
  	 * @return                A Lucene FileIndexingService Document with appropriate fields updated
  	 * @exception  Throwable  Thrown if error occurs
  	 */
! 	public synchronized Document getDeletedDoc(Document existingDoc)
  		 throws Throwable {

! 		if (existingDoc == null)
  			throw new Exception("getDeletedDoc(): the existing doc is null");
  		isMakingDeletedDoc = true;

! 		return existingDoc;
  	}

--- 378,394 ----
  	 *  #getIsMakingDeletedDoc} to execute as appropriate.
  	 *
! 	 * @param  previousRecordDoc    An existing FileIndexingService Document that currently resides in the index for
  	 *      the given file
  	 * @return                A Lucene FileIndexingService Document with appropriate fields updated
  	 * @exception  Throwable  Thrown if error occurs
  	 */
! 	public synchronized Document getDeletedDoc(Document previousRecordDoc)
  		 throws Throwable {

! 		if (previousRecordDoc == null)
  			throw new Exception("getDeletedDoc(): the existing doc is null");
  		isMakingDeletedDoc = true;

! 		return previousRecordDoc;
  	}

***************
*** 470,474 ****
  			if (sourceFile != null)
  				this.dir = source.getParentFile();
! 			this.existingDoc = existingLuceneDoc;

  			/*
--- 477,481 ----
  			if (sourceFile != null)
  				this.dir = source.getParentFile();
! 			this.previousRecordDoc = existingLuceneDoc;

  			/*
***************
*** 478,487 ****
  		 *  correct DocReader
  		 */
! 			defaultBuffer = new StringBuffer();
! 			adminDefaultBuffer = new StringBuffer();
  			if (newDocData == null)
  				newDocData = new FileIndexingServiceData();
  			newDocData.clearAll();
! 			init(source, existingDoc);
  			if (abortIndexing) {
  				destroy();
--- 485,493 ----
  		 *  correct DocReader
  		 */
! 
  			if (newDocData == null)
  				newDocData = new FileIndexingServiceData();
  			newDocData.clearAll();
! 			init(source, previousRecordDoc);
  			if (abortIndexing) {
  				destroy();
***************
*** 489,493 ****
  			}

! 			Document doc = new Document();

  			// A field/term that matches all records:
--- 495,499 ----
  			}

! 			Document doc = getLuceneDoc();

  			// A field/term that matches all records:
***************
*** 537,543 ****
  			doc.add(new Field("readerclass", getReaderClass(), Field.Store.YES, Field.Index.TOKENIZED));

! 			// ------- Additional fields that are unique to the framework being indexed -------

! 			addCustomFields(doc, existingDoc, source);

  			// ------- Add file validation information, if apporpriate -------
--- 543,549 ----
  			doc.add(new Field("readerclass", getReaderClass(), Field.Store.YES, Field.Index.TOKENIZED));

! 			// ------- Index all standard and custom fields for the framework being indexed by subclasses -------

! 			addCustomFields(doc, previousRecordDoc, source);

  			// ------- Add file validation information, if apporpriate -------
***************
*** 560,567 ****

  			// Index the 'default' and 'stems' fields. See class JavaDoc for details on this field.
! 			IndexingTools.indexDefaultAndStemsFields(doc, defaultBuffer.toString());

  			// Admin default field
! 			doc.add(new Field(adminDefaultFieldName, adminDefaultBuffer.toString(), Field.Store.NO, Field.Index.TOKENIZED));

  			if (abortIndexing) {
--- 566,573 ----

  			// Index the 'default' and 'stems' fields. See class JavaDoc for details on this field.
! 			//IndexingTools.indexDefaultAndStemsFields(doc, defaultBuffer.toString());

  			// Admin default field
! 			//doc.add(new Field(adminDefaultFieldName, adminDefaultBuffer.toString(), Field.Store.NO, Field.Index.TOKENIZED));

  			if (abortIndexing) {

Index: IndexingTools.java
===================================================================
RCS file: /cvsroot/dlsciences/dlese-tools-project/src/org/dlese/dpc/index/writer/IndexingTools.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** IndexingTools.java	20 Mar 2009 23:33:53 -0000	1.10
--- IndexingTools.java	16 Jun 2010 19:11:18 -0000	1.11
***************
*** 14,18 ****
   *  All rights reserved.
   */
- 
  package org.dlese.dpc.index.writer;

--- 14,17 ----
***************
*** 36,42 ****
  public class IndexingTools {

! 	private static final String defaultFieldName = "default";
! 	private static final String stemsFieldName = "stems";
! 	
  	/**
  	 *  String used to separate and preserve phrases indexed as text, includes leading and trailing white space.
--- 35,45 ----
  public class IndexingTools {

! 	/**  Default field 'default' */
! 	public final static String defaultFieldName = "default";
! 	/**  Stems field 'stems' */
! 	public final static String stemsFieldName = "stems";
! 	/**  Admin default field 'admindefault' */
! 	public final static String adminDefaultFieldName = "admindefault";
! 
  	/**
  	 *  String used to separate and preserve phrases indexed as text, includes leading and trailing white space.
***************
*** 45,56 ****

! 	public final static void indexDefaultAndStemsFields(Document myDoc, String content) {
  		// See class JavaDoc for details on this field.
! 		myDoc.add(new Field(defaultFieldName, content,Field.Store.NO, Field.Index.TOKENIZED));
  		// Note that the Analyzer will handle converting the tokens in this field to their stem form.
! 		myDoc.add(new Field(stemsFieldName, content,Field.Store.NO, Field.Index.TOKENIZED));
  	}
! 			
! 	
  	/**
  	 *  Creates a String separated by the phrase separator term from the text of each of the Element or
--- 48,76 ----

! 	/**
! 	 *  Indexes the given text into the default and stems fields.
! 	 *
! 	 * @param  myDoc    Document to add to
! 	 * @param  content  Content to add
! 	 */
! 	public final static void addToDefaultAndStemsFields(Document myDoc, String content) {
  		// See class JavaDoc for details on this field.
! 		myDoc.add(new Field(defaultFieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
  		// Note that the Analyzer will handle converting the tokens in this field to their stem form.
! 		myDoc.add(new Field(stemsFieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
  	}
! 
! 
! 	/**
! 	 *  Indexes the given text into the admin default field.
! 	 *
! 	 * @param  myDoc    Document to add to
! 	 * @param  content  Content to add
! 	 */
! 	public final static void addToAdminDefaultField(Document myDoc, String content) {
! 		myDoc.add(new Field(adminDefaultFieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
! 	}
! 
! 
  	/**
  	 *  Creates a String separated by the phrase separator term from the text of each of the Element or
***************
*** 267,279 ****
  	 *  to the field, the values are Lists of the terms in that field.
  	 *
! 	 * @param  textToParse   The text to analyze with the analyzer
! 	 * @param  analyzer      The analyzer to use
! 	 * @param  field         DESCRIPTION
! 	 * @return               The terms HashMap generated by the analyzer
  	 */
  	/* public final static HashMap getAnalyzedTermsHashMap(String textToParse, String defaultField, Analyzer analyzer) {
  		Reader reader = new StringReader(textToParse);
  		TokenStream in = analyzer.tokenStream((defaultField == null ? "default" : defaultField), reader);
- 
  		HashMap map = new HashMap();
  		ArrayList termList = new ArrayList();
--- 287,298 ----
  	 *  to the field, the values are Lists of the terms in that field.
  	 *
! 	 * @param  textToParse  The text to analyze with the analyzer
! 	 * @param  analyzer     The analyzer to use
! 	 * @param  field        DESCRIPTION
! 	 * @return              The terms HashMap generated by the analyzer
  	 */
  	/* public final static HashMap getAnalyzedTermsHashMap(String textToParse, String defaultField, Analyzer analyzer) {
  		Reader reader = new StringReader(textToParse);
  		TokenStream in = analyzer.tokenStream((defaultField == null ? "default" : defaultField), reader);
  		HashMap map = new HashMap();
  		ArrayList termList = new ArrayList();
***************
*** 291,295 ****
  		return (String[]) termList.toArray(new String[]{});
  	} */
- 
  	/**
  	 *  Creates a StringBuffer to display the tokens created by a given analyzer. Output is of the form: [token1]
--- 310,313 ----

[Dlsciences-dlese-tools] dlese-tools-project/src/org/dlese/dpc/index/writer FileIndexingServiceWriter.java, 1.51, 1.52, IndexingTools.java, 1.10, 1.11, SimpleXMLFileIndexingWriter.java, 1.20, 1.21

[Dlsciences-dlese-tools] dlese-tools-project/src/org/dlese/dpc/index/writer FileIndexingServiceWriter.java, 1.51, 1.52, IndexingTools.java, 1.10, 1.11, SimpleXMLFileIndexingWriter.java, 1.20, 1.21,