andrew goh - 2020-02-27

This is a code contribution that adds support for MAFF (Mozilla Archive Format) files.

https://en.wikipedia.org/wiki/Mozilla_Archive_Format

A MAFF file is a ZIP archive in which the contents of a saved web page
are stored.

http://maf.mozdev.org/maff-specification.html

The main parser code:

src/net/sourceforge/docfetcher/model/parse/MaffParser.java

package net.sourceforge.docfetcher.model.parse;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;

import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.sourceforge.docfetcher.enums.Msg;
import net.sourceforge.docfetcher.util.annotations.NotNull;
import net.sourceforge.docfetcher.util.annotations.Nullable;

final class MaffParser extends FileParser {

private static final Collection<string> extensions = Arrays.asList(
            "maf", "maff");
    private static final Collection<string> types = Arrays.asList(
            MediaType.text("html"),
            MediaType.application("zip"));</string></string>

public MaffParser() {
    }

@Override
    protected ParseResult parse(File file, ParseContext context)
            throws ParseException {
        Source source = getSource(file);

// Get tags
                Element titleElement = source.getNextElement(0,
HTMLElementName.TITLE);
                String title = titleElement == null ?
                        "" :
CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent());
                String author = getMetaValue(source, "author");
                String description = getMetaValue(source, "description");
                String keywords = getMetaValue(source, "keywords");

// Get contents
                Element bodyElement = source.getNextElement(0,
HTMLElementName.BODY);
                String contents;
                if (bodyElement != null) {
                    contents =
bodyElement.getContent().getTextExtractor().toString();
                }
                else {
                    /
                     * Certain HTML-like files don't have a body, but
contain
                     * extractable text, e.g. Chrome bookmark files.
Such files can
                     * indexed by rendering them and processing the
resulting text.
/
                    contents =
source.getRenderer().setIncludeHyperlinkURLs(false)
                            .toString();
                }

return new ParseResult(contents)
                    .setTitle(title)
                    .addAuthor(author)
                    .addMiscMetadata(description)
                    .addMiscMetadata(keywords);
    }

@Override
    protected String renderText(File file, String filename)
            throws ParseException {
            String text = "";

Source source = getSource(file);
            text =
source.getRenderer().setIncludeHyperlinkURLs(false).toString();
            if (text == null || text.equals(""))
                return super.renderText(file, filename);
            else
                return text;
    }

/*
     * Returns the value of the meta tag with the given name in the
specified
     * HTML source. Returns null if the meta tag does not exist.
/
    @Nullable
    private String getMetaValue(@NotNull Source source, @NotNull String
key) {
        int pos = 0;
        while (pos < source.length()) {
            StartTag startTag = source.getNextStartTag(pos, "name",
key, false); //$NON-NLS-1$
            if (startTag == null) return null;
            if (startTag.getName() == HTMLElementName.META)
                return startTag.getAttributeValue("content"); //$NON-NLS-1$
            pos = startTag.getEnd();
        }
        return null;
    }

/*
     * Returns a {@code Source} for the index.html.
/
    @NotNull
    private static Source getSource(@NotNull File file)
            throws ParseException {
        try {
            ZipFile zipfile = new ZipFile(file);
            boolean found = false;
            Enumeration<zipentry> entries = (Enumeration<zipentry>)
zipfile.entries();
            ZipEntry entry = null;
            while(entries.hasMoreElements()) {
                entry = entries.nextElement();
                if(entry.getName().endsWith("index.html") ||
entry.getName().endsWith("index.xhtml")) {
                    found = true;
                    break;
                }
            }
            if(!found) {
                zipfile.close();
                throw new ParseException(file.getName().concat(" not a
maf archive"));
            }
            InputStream in = zipfile.getInputStream(entry);
            Source source = new Source(in);
            source.setLogger(null);
            source.fullSequentialParse();
            zipfile.close();
            return source;
        } catch (ZipException e) {
            throw new ParseException(e);
        } catch (IOException e) {
            throw new ParseException(e);
        }
    }</zipentry></zipentry>

@Override
    protected Collection<string> getExtensions() {
        return extensions;
    }</string>

@Override
    protected Collection<string> getTypes() {
        return types;
    }</string>

@Override
    public String getTypeLabel() {
        return Msg.filetype_maf.get();
    }

}