// Get contents
Element bodyElement = source.getNextElement(0,
HTMLElementName.BODY);
String contents;
if (bodyElement != null) {
contents =
bodyElement.getContent().getTextExtractor().toString();
}
else {
/
* Certain HTML-like files don't have a body, but
contain
* extractable text, e.g. Chrome bookmark files.
Such files can
* indexed by rendering them and processing the
resulting text. /
contents =
source.getRenderer().setIncludeHyperlinkURLs(false)
.toString();
}
return new ParseResult(contents)
.setTitle(title)
.addAuthor(author)
.addMiscMetadata(description)
.addMiscMetadata(keywords);
}
Source source = getSource(file);
text =
source.getRenderer().setIncludeHyperlinkURLs(false).toString();
if (text == null || text.equals(""))
return super.renderText(file, filename);
else
return text;
}
/*
* Returns the value of the meta tag with the given name in the
specified
* HTML source. Returns null if the meta tag does not exist. /
@Nullable
private String getMetaValue(@NotNull Source source, @NotNull String
key) {
int pos = 0;
while (pos < source.length()) {
StartTag startTag = source.getNextStartTag(pos, "name",
key, false); //$NON-NLS-1$
if (startTag == null) return null;
if (startTag.getName() == HTMLElementName.META)
return startTag.getAttributeValue("content"); //$NON-NLS-1$
pos = startTag.getEnd();
}
return null;
}
/*
* Returns a {@code Source} for the index.html. /
@NotNull
private static Source getSource(@NotNull File file)
throws ParseException {
try {
ZipFile zipfile = new ZipFile(file);
boolean found = false;
Enumeration<zipentry> entries = (Enumeration<zipentry>)
zipfile.entries();
ZipEntry entry = null;
while(entries.hasMoreElements()) {
entry = entries.nextElement();
if(entry.getName().endsWith("index.html") ||
entry.getName().endsWith("index.xhtml")) {
found = true;
break;
}
}
if(!found) {
zipfile.close();
throw new ParseException(file.getName().concat(" not a
maf archive"));
}
InputStream in = zipfile.getInputStream(entry);
Source source = new Source(in);
source.setLogger(null);
source.fullSequentialParse();
zipfile.close();
return source;
} catch (ZipException e) {
throw new ParseException(e);
} catch (IOException e) {
throw new ParseException(e);
}
}</zipentry></zipentry>
This is a code contribution that adds support for MAFF (Mozilla Archive Format):
https://en.wikipedia.org/wiki/Mozilla_Archive_Format
A MAFF file is a zip archive in which the contents of a saved web page are stored.
http://maf.mozdev.org/maff-specification.html
The main parser code:
src/net/sourceforge/docfetcher/model/parse/MaffParser.java
package net.sourceforge.docfetcher.model.parse;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.sourceforge.docfetcher.enums.Msg;
import net.sourceforge.docfetcher.util.annotations.NotNull;
import net.sourceforge.docfetcher.util.annotations.Nullable;
final class MaffParser extends FileParser {
private static final Collection<string> extensions = Arrays.asList(
"maf", "maff");
private static final Collection<string> types = Arrays.asList(
MediaType.text("html"),
MediaType.application("zip"));</string></string>
public MaffParser() {
}
@Override
protected ParseResult parse(File file, ParseContext context)
throws ParseException {
Source source = getSource(file);
// Get tags
Element titleElement = source.getNextElement(0,
HTMLElementName.TITLE);
String title = titleElement == null ?
"" :
CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent());
String author = getMetaValue(source, "author");
String description = getMetaValue(source, "description");
String keywords = getMetaValue(source, "keywords");
// Get contents
Element bodyElement = source.getNextElement(0,
HTMLElementName.BODY);
String contents;
if (bodyElement != null) {
contents =
bodyElement.getContent().getTextExtractor().toString();
}
else {
/
* Certain HTML-like files don't have a body, but
contain
* extractable text, e.g. Chrome bookmark files.
Such files can
* indexed by rendering them and processing the
resulting text.
/
contents =
source.getRenderer().setIncludeHyperlinkURLs(false)
.toString();
}
return new ParseResult(contents)
.setTitle(title)
.addAuthor(author)
.addMiscMetadata(description)
.addMiscMetadata(keywords);
}
@Override
protected String renderText(File file, String filename)
throws ParseException {
String text = "";
Source source = getSource(file);
text =
source.getRenderer().setIncludeHyperlinkURLs(false).toString();
if (text == null || text.equals(""))
return super.renderText(file, filename);
else
return text;
}
/*
* Returns the value of the meta tag with the given name in the
specified
* HTML source. Returns null if the meta tag does not exist.
/
@Nullable
private String getMetaValue(@NotNull Source source, @NotNull String
key) {
int pos = 0;
while (pos < source.length()) {
StartTag startTag = source.getNextStartTag(pos, "name",
key, false); //$NON-NLS-1$
if (startTag == null) return null;
if (startTag.getName() == HTMLElementName.META)
return startTag.getAttributeValue("content"); //$NON-NLS-1$
pos = startTag.getEnd();
}
return null;
}
/*
* Returns a {@code Source} for the index.html.
/
@NotNull
private static Source getSource(@NotNull File file)
throws ParseException {
try {
ZipFile zipfile = new ZipFile(file);
boolean found = false;
Enumeration<zipentry> entries = (Enumeration<zipentry>)
zipfile.entries();
ZipEntry entry = null;
while(entries.hasMoreElements()) {
entry = entries.nextElement();
if(entry.getName().endsWith("index.html") ||
entry.getName().endsWith("index.xhtml")) {
found = true;
break;
}
}
if(!found) {
zipfile.close();
throw new ParseException(file.getName().concat(" not a
maf archive"));
}
InputStream in = zipfile.getInputStream(entry);
Source source = new Source(in);
source.setLogger(null);
source.fullSequentialParse();
zipfile.close();
return source;
} catch (ZipException e) {
throw new ParseException(e);
} catch (IOException e) {
throw new ParseException(e);
}
}</zipentry></zipentry>
@Override
protected Collection<string> getExtensions() {
return extensions;
}</string>
@Override
protected Collection<string> getTypes() {
return types;
}</string>
@Override
public String getTypeLabel() {
return Msg.filetype_maf.get();
}
}