[Archive-access-cvs] archive-access/projects/wera/src/java/no/nb/nwa/retriever AID.java,NONE,1.1 ARCRetriever.java,NONE,1.1 ArcRetrieverException.java,NONE,1.1

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/archive-access/archive-access/projects/wera/src/java/no/nb/nwa/retriever
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv1719/src/java/no/nb/nwa/retriever

Added Files:
	AID.java ARCRetriever.java ArcRetrieverException.java 
Log Message:
Added building of the arcretriever war file.


--- NEW FILE: AID.java ---
/*
 *  This file is part of The NWA Toolset.
 *
 *  Copyright (C) 2001-2002 Royal Library in Stockholm,
 *                          Royal Library in Copenhagen,
 *                          Helsinki University Library of Finland, 
 *                          National Library of Norway,
 *                          National and University Library of Iceland.
 *
 *  The NWA Toolset is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  The NWA Toolset is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with The NWA Toolset; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

package no.nb.nwa.retriever;

/**
 * Immutable archive identifier of the form
 * <code>&lt;offset&gt;&lt;filename&gt;</code>, where the offset is the
 * decimal number in front of the first <code>'/'</code> and the filename is
 * everything from that first <code>'/'</code> (inclusive) to the end of the
 * string. For example <code>"1234/data/a.arc"</code> yields offset
 * <code>1234</code> and filename <code>"/data/a.arc"</code>.
 * 
 * @author John Erik Halse
 *  
 */
public class AID {

    /** The identifier exactly as it was passed to the constructor. */
    private final String aid;

    /** Part of the identifier starting at (and including) the first '/'. */
    private final String filename;

    /** Number found in front of the first '/'. */
    private final long offset;

    /**
     * Parses an archive identifier.
     * 
     * @param aid
     *            the identifier to parse
     * @throws ArcRetrieverException
     *             with code
     *             {@link ArcRetrieverException#ERROR_UNABLE_TO_PARSE_ARCHIVE_IDENTIFIER}
     *             if <code>aid</code> is <code>null</code>, contains no
     *             <code>'/'</code>, or the text in front of the first
     *             <code>'/'</code> is not a valid <code>long</code>. Only
     *             the last case carries the underlying
     *             <code>NumberFormatException</code> as cause.
     */
    public AID(String aid) throws ArcRetrieverException {
        // Validate the structure explicitly instead of relying on
        // substring()/NullPointerException failures as control flow.
        if (aid == null) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_UNABLE_TO_PARSE_ARCHIVE_IDENTIFIER);
        }
        int filenameOffset = aid.indexOf('/');
        if (filenameOffset < 0) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_UNABLE_TO_PARSE_ARCHIVE_IDENTIFIER);
        }

        long parsedOffset;
        try {
            parsedOffset = Long.parseLong(aid.substring(0, filenameOffset));
        } catch (NumberFormatException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_UNABLE_TO_PARSE_ARCHIVE_IDENTIFIER,
                    e);
        }

        this.aid = aid;
        // The leading '/' is deliberately kept as part of the filename.
        this.filename = aid.substring(filenameOffset);
        this.offset = parsedOffset;
    }

    /**
     * @return Returns the filename, including its leading '/'.
     */
    public String getFilename() {
        return filename;
    }

    /**
     * @return Returns the offset.
     */
    public long getOffset() {
        return offset;
    }

    /**
     * @return the identifier string this object was created from.
     */
    public String toString() {
        return aid;
    }
}

--- NEW FILE: ARCRetriever.java ---
/*
 *  This file is part of The NWA Toolset.
 *
 *  Copyright (C) 2001-2002 Royal Library in Stockholm,
 *                          Royal Library in Copenhagen,
 *                          Helsinki University Library of Finland, 
 *                          National Library of Norway,
 *                          National and University Library of Iceland.
 *
 *  The NWA Toolset is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  The NWA Toolset is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with The NWA Toolset; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package no.nb.nwa.retriever;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HeaderGroup;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.util.ArchiveUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

/**
 * 
 * @author John Erik Halse
 *  
 */
public class ARCRetriever extends HttpServlet {

    final static Pattern charsetPattern = Pattern
            .compile("^.*charset=([^\\s]+).*$");

    /**
     *  
     */
    public ARCRetriever() {
    }

    protected void doGet(HttpServletRequest request,
            HttpServletResponse response) throws ServletException, IOException {
        try {
            long now = System.currentTimeMillis();
            String reqtype = request.getParameter("reqtype");
            String aid = request.getParameter("aid");

            if (reqtype == null) {
                throw new ArcRetrieverException(
                        ArcRetrieverException.ERROR_REQTYPE_MISSING);
            } else {
                reqtype = reqtype.intern();
            }

            ARCRetriever retriever = new ARCRetriever();
            if (reqtype == "getfile") {
                if (aid == null) {
                    throw new ArcRetrieverException(
                            ArcRetrieverException.ERROR_ARCHIVE_IDENTIFIER_MISSING);
                } else {
                    retriever.getDocument(response, new AID(aid));
                }
            } else if (reqtype == "getmeta") {
                if (aid == null) {
                    throw new ArcRetrieverException(
                            ArcRetrieverException.ERROR_ARCHIVE_IDENTIFIER_MISSING);
                } else {
                    retriever.getMeta(response, new AID(aid));
                }
            } else if (reqtype == "getfilestatus") {
                if (aid == null) {
                    throw new ArcRetrieverException(
                            ArcRetrieverException.ERROR_ARCHIVE_IDENTIFIER_MISSING);
                } else {
                    retriever.getFileStatus(response, new AID(aid));
                }
            } else if (reqtype == "getarchiveinfo") {
                retriever.getArchiveInfo(response);
            } else {
                throw new ArcRetrieverException(
                        ArcRetrieverException.ERROR_UNSUPPORTED_REQTYPE);
            }
        } catch (Throwable e) {
            handleException(response, e);
        }
    }

    protected void doPost(HttpServletRequest request,
            HttpServletResponse response) throws ServletException, IOException {
        this.doGet(request, response);
    }

    public void getFileStatus(HttpServletResponse response, AID aid)
            throws ArcRetrieverException {
        response.setContentType("text/xml; charset=UTF-8");

        String status = "";
        String status_long = "";
        File file = new File(aid.getFilename());
        ARCReader arc = null;
        try {
            arc = ARCReaderFactory.get(file);
            ARCRecord rec = null;
            try {
                rec = arc.get(aid.getOffset());
                status = "online";
                status_long = "Document is available.";
            } catch (IOException e) {
                status = "non-existent";
                status_long = "No document at offset: " + aid.getOffset();
            } finally {
                if (rec != null) rec.close();
            }
        } catch (IOException e) {
            status = "non-existent";
            //            status_long = "File '" + file.getAbsolutePath()
            status_long = "File '" + aid.getFilename()
                    + "' doesn't exist or is not an ARC file.";
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (arc != null) arc.close();
            } catch (IOException e) {
            }
        }

        try {
            Document dom = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder().newDocument();
            Node msg = dom.appendChild(dom.createElement("retrievermessage"));
            Node head = msg.appendChild(dom.createElement("head"));
            addTextElement(head, "reqtype", "getfilestatus");
            addTextElement(head, "aid", aid.toString());
            Node body = msg.appendChild(dom.createElement("body"));
            addTextElement(body, "filestatus", status);
            addTextElement(body, "filestatus_long", status_long);

            Transformer transformer = TransformerFactory.newInstance()
                    .newTransformer();
            transformer.setOutputProperty("indent", "yes");
            Result res = new StreamResult(response.getOutputStream());
            Source source = new DOMSource(dom);
            transformer.transform(source, res);
        } catch (ParserConfigurationException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (TransformerConfigurationException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (IOException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (TransformerException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        }
    }

    public void getArchiveInfo(HttpServletResponse response)
            throws ArcRetrieverException {
        response.setContentType("text/xml; charset=UTF-8");

        try {
            Document dom = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder().newDocument();
            Node msg = dom.appendChild(dom.createElement("retrievermessage"));
            Node head = msg.appendChild(dom.createElement("head"));
            addTextElement(head, "reqtype", "getarchiveinfo");
            Node body = msg.appendChild(dom.createElement("body"));
            addTextElement(body, "info", "ArcRetriever");

            Transformer transformer = TransformerFactory.newInstance()
                    .newTransformer();
            transformer.setOutputProperty("indent", "yes");
            Result res = new StreamResult(response.getOutputStream());
            Source source = new DOMSource(dom);
            transformer.transform(source, res);
        } catch (ParserConfigurationException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (TransformerConfigurationException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (IOException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (TransformerException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        }
    }

    public void getMeta(HttpServletResponse response, AID aid)
            throws ArcRetrieverException {
        ARCReader arc = null;
        ARCRecord rec = null;
        OutputStream out = null;
        try {
            File file = new File(aid.getFilename());
            try {
                arc = ARCReaderFactory.get(file);
                rec = arc.get(aid.getOffset());
            } catch (IOException e) {
                throw new ArcRetrieverException(
                        ArcRetrieverException.ERROR_OBJECT_NOT_ACCESSIBLE);
            } catch (Exception e) {
                throw new ArcRetrieverException(
                        ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
            }
            ARCRecordMetaData meta = rec.getMetaData();
            out = response.getOutputStream();
            HeaderGroup headers = new HeaderGroup();
            headers.setHeaders(rec.getHttpHeaders());

            response.setContentType("text/xml; charset=UTF-8");

            Document dom = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder().newDocument();
            Node msg = dom.appendChild(dom.createElement("retrievermessage"));
            Node head = msg.appendChild(dom.createElement("head"));
            addTextElement(head, "reqtype", "getmeta");
            addTextElement(head, "aid", aid.toString());
            Node body = msg.appendChild(dom.createElement("body"));
            Node metadata = body.appendChild(dom.createElement("metadata"));
            addTextElement(metadata, "url", meta.getUrl());
            String arcDate = meta.getDate();
            addTextElement(metadata, "archival_time", arcDate);

            try {
                // Trying to parse dates in the following format:
                // Mon, 14 Jun 2004 11:13:02 GMT
                DateFormat df = new SimpleDateFormat(
                        "E, d MMM yyyy HH:mm:ss z", Locale.US);
                String lastModDate = getHttpHeader(headers, "last-modified");
                lastModDate = ArchiveUtils.get14DigitDate(df.parse(lastModDate)
                        .getTime());
                addTextElement(metadata, "last_modified_time", lastModDate);
            } catch (ParseException e) {
                addTextElement(metadata, "last_modified_time", arcDate);
            } catch (NullPointerException e) {
                addTextElement(metadata, "last_modified_time", arcDate);
            }
            addTextElement(metadata, "content_length", getHttpHeader(headers,
                    "content-length"));
            Node contenttype = metadata.appendChild(dom
                    .createElement("contenttype"));
            addTextElement(contenttype, "type", meta.getMimetype());
            String contentTypeString = getHttpHeader(headers, "content-type");
            String charset = "";
            if (contentTypeString != null) {
                Matcher m = charsetPattern.matcher(contentTypeString
                        .toLowerCase());
                if (m.matches()) {
                    charset = m.group(1);
                }
            }
            addTextElement(contenttype, "charset", charset);
            addTextElement(metadata, "filestatus", "online");
            addTextElement(metadata, "filestatus_long", "");
            // TODO: Fix.
            // String header = rec.getHttpHeaderString().replaceAll("\r", "");
            String header = "UNIMPLEMENTED-TODO";
            //remove illegal XML-characters
            header = header.replaceAll("[\\p{Cc}&&[^\\u0009\\u000A\\u000D]]+",
                    "???");
            header = header.trim();
            rec.close();
            addTextElement(metadata, "content_checksum", meta.getDigest());
            arc.close();

            addTextElement(metadata, "http-header", header);

            Transformer transformer = TransformerFactory.newInstance()
                    .newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            Result res = new StreamResult(out);
            Source source = new DOMSource(dom);
            transformer.transform(source, res);
        } catch (ParserConfigurationException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (TransformerConfigurationException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (IOException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } catch (TransformerException e) {
            throw new ArcRetrieverException(
                    ArcRetrieverException.ERROR_BAD_FUNCTION_ARGUMENT, e);
        } finally {
            try {
                if (out != null) out.close();
            } catch (IOException e) {
            }
        }
    }

    private String getHttpHeader(HeaderGroup headers, String headerName) {
        Header header = headers.getCondensedHeader(headerName);
        return header == null ? "" : header.getValue();
    }

    private void addTextElement(Node parent, String elementName, String value) {
        value = value == null ? "" : value;
        Document dom = parent.getOwnerDocument();
        parent.appendChild(dom.createElement(elementName)).appendChild(
                dom.createTextNode(value));
    }

    private void addCDataElement(Node parent, String elementName, String value) {
        value = value == null ? "" : value;
        Document dom = parent.getOwnerDocument();
        parent.appendChild(dom.createElement(elementName)).appendChild(
                dom.createCDATASection(value));
    }

    public void getDocument(HttpServletResponse response, AID aid)
            throws Exception {
        OutputStream out = response.getOutputStream();
        ARCRecord rec = null;
        ARCReader arc = null;
        File file = new File(aid.getFilename());
        arc = ARCReaderFactory.get(file);
        rec = arc.get(aid.getOffset());
        ARCRecordMetaData meta = rec.getMetaData();
        rec.skipHttpHeader();
        HeaderGroup headers = new HeaderGroup();
        headers.setHeaders(rec.getHttpHeaders());
        String contentTypeString = getHttpHeader(headers, "content-type");
        response.setContentType(contentTypeString);
        //response.setContentLength((int) meta.getLength());

        byte[] buf = new byte[1024];
        int c;
        while ((c = rec.read(buf)) != -1) {
            out.write(buf, 0, c);
        }
        out.flush();
        rec.close();
        arc.close();
    }

    private void handleException(HttpServletResponse response, Throwable t)
            throws UnsupportedEncodingException, IOException {
        response.setContentType("text/xml; charset=UTF-8");
        PrintWriter out = new PrintWriter(new OutputStreamWriter(response
                .getOutputStream(), "UTF-8"));
        out.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
        ArcRetrieverException are;
        if (t instanceof ArcRetrieverException) {
            are = (ArcRetrieverException) t;
        } else {
            are = new ArcRetrieverException(7, t);
        }

        out.println("<retrievermessage>");
        out.println("  <head>");
        out.println("    <errorcode>" + are.getErrorCode() + "</errorcode>");
        out.println("    <errormessage>" + are.getLocalizedMessage()
                + "</errormessage>");
        out.println("  </head>");
        if (are.getCause() != null) {
            out.println("\n  <body>");
            out.println("Cause: " + are.getCause().getClass().getName() + ": "
                    + are.getCause().getLocalizedMessage());
            out.println("\nStack trace:");
            StackTraceElement[] trace = are.getCause().getStackTrace();
            for (int i = 0; i < trace.length; i++) {
                out.println(trace[i].toString().replaceAll("<", "&lt;").replaceAll(
                        ">", "&gt;"));
            }
            out.println("  </body>");
        }
        out.println("</retrievermessage>");
        out.flush();
        out.close();
    }
}

--- NEW FILE: ArcRetrieverException.java ---
/*
 *  This file is part of The NWA Toolset.
 *
 *  Copyright (C) 2001-2002 Royal Library in Stockholm,
 *                          Royal Library in Copenhagen,
 *                          Helsinki University Library of Finland, 
 *                          National Library of Norway,
 *                          National and University Library of Iceland.
 *
 *  The NWA Toolset is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  The NWA Toolset is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with The NWA Toolset; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

package no.nb.nwa.retriever;


/**
 * Exception carrying one of the numeric retriever error codes defined
 * below. The exception message is the English text associated with the
 * code.
 * 
 * @author John Erik Halse
 *
 */
public class ArcRetrieverException extends Exception {
    public final static int ERROR_REQTYPE_MISSING = 1;
    public final static int ERROR_ARCHIVE_IDENTIFIER_MISSING = 2;
    public final static int ERROR_UNSUPPORTED_REQTYPE = 3;
    public final static int ERROR_UNABLE_TO_PARSE_ARCHIVE_IDENTIFIER = 4;
    public final static int ERROR_DOCUMENT_ROOT_NOT_SET = 5;
    public final static int ERROR_OBJECT_NOT_ACCESSIBLE = 6;
    public final static int ERROR_BAD_FUNCTION_ARGUMENT = 7;

    /** Message texts, indexed by error code. */
    private final static String[] msg = {
        "",
        "Reqtype missing",
        "Archive Identifier missing",
        "Unsupported reqtype",
        "Unable to parse Archive Identifier",
        "Document Root not set",
        "Object not accessible",
        "Bad function argument"
    };
    
    /** The error code this exception was created with. */
    private final int errorCode;
    
    /**
     * Not for public use: creates an exception with error code 0 and no
     * message.
     */
    private ArcRetrieverException() {
        this.errorCode = 0;
    }

    /**
     * Creates an exception for the given error code.
     * 
     * @param type
     *            one of the <code>ERROR_*</code> constants; an unknown
     *            code is accepted and gets a generic message
     */
    public ArcRetrieverException(int type) {
        super(messageFor(type));
        this.errorCode = type;
    }

    /**
     * Creates an exception for the given error code with an underlying
     * cause.
     * 
     * @param type
     *            one of the <code>ERROR_*</code> constants; an unknown
     *            code is accepted and gets a generic message
     * @param cause
     *            the failure that triggered this exception
     */
    public ArcRetrieverException(int type, Throwable cause) {
        super(messageFor(type), cause);
        this.errorCode = type;
    }
    
    /**
     * @return the error code passed to the constructor.
     */
    public int getErrorCode() {
        return errorCode;
    }

    /**
     * Looks up the message for an error code. Codes outside the message
     * table get a generic text, so that constructing the exception never
     * fails with an ArrayIndexOutOfBoundsException.
     */
    private static String messageFor(int type) {
        if (type < 0 || type >= msg.length) {
            return "Unknown error code: " + type;
        }
        return msg[type];
    }
}

[Archive-access-cvs] archive-access/projects/wera/src/java/no/nb/nwa/retriever AID.java,NONE,1.1 ARC

[Archive-access-cvs] archive-access/projects/wera/src/java/no/nb/nwa/retriever AID.java,NONE,1.1 ARCRetriever.java,NONE,1.1 ArcRetrieverException.java,NONE,1.1