From: SVN c. m. f. t. SWORD-A. p. <swo...@li...> - 2012-01-05 17:30:58
|
Revision: 416 http://sword-app.svn.sourceforge.net/sword-app/?rev=416&view=rev Author: richard-jones Date: 2012-01-05 17:30:49 +0000 (Thu, 05 Jan 2012) Log Message: ----------- first hack at dividing the original sss.py into multiple parts ready for refactoring and modularisation Modified Paths: -------------- sss/branches/sss-2/sss/sss.py Added Paths: ----------- sss/branches/sss-2/sss/config.py sss/branches/sss-2/sss/core.py sss/branches/sss-2/sss/ingesters_disseminators.py sss/branches/sss-2/sss/negotiator.py sss/branches/sss-2/sss/repository.py sss/branches/sss-2/sss/sss_logging.py sss/branches/sss-2/sss/webpy.py sss/branches/sss-2/sss/webui.py Added: sss/branches/sss-2/sss/config.py =================================================================== --- sss/branches/sss-2/sss/config.py (rev 0) +++ sss/branches/sss-2/sss/config.py 2012-01-05 17:30:49 UTC (rev 416) @@ -0,0 +1,100 @@ +import os, uuid, sys +from negotiator import ContentType +from ingesters_disseminators import DefaultEntryIngester, DefaultDisseminator, FeedDisseminator, BinaryIngester, SimpleZipIngester, METSDSpaceIngester + +class Configuration(object): + def __init__(self): + # The base url of the webservice where SSS is deployed + self.base_url = "http://localhost:%s/" % (sys.argv[1] if len(sys.argv) > 1 else '8080') + + # The number of collections that SSS will create and give to users to deposit content into + self.num_collections = 10 + + # The directory where the deposited content should be stored + self.store_dir = os.path.join(os.getcwd(), "store") + + # explicitly set the sword version, so if you're testing validation of + # service documents you can "break" it. + self.sword_version = "2.0" # SWORD 2.0! Oh yes! + + # user details; the user/password pair should be used for HTTP Basic Authentication, and the obo is the user + # to use for On-Behalf-Of requests. 
Set authenticate=False if you want to test the server without caring + # about authentication, set mediation=False if you want to test the server's errors on invalid attempts at + # mediation + self.authenticate = True + self.user = "sword" + self.password = "sword" + + self.mediation = True + self.obo = "obo" + + # What media ranges should the app:accept element in the Service Document support + self.app_accept = ["*/*"] + self.multipart_accept = ["*/*"] + self.accept_nothing = False + + # use these app_accept and multipart_accept values to create an invalid Service Document + #self.app_accept = None + #self.multipart_accept = None + + # should we provide sub-service urls + self.use_sub = True + + # What packaging formats should the sword:acceptPackaging element in the Service Document support + self.sword_accept_package = [ + "http://purl.org/net/sword/package/SimpleZip", + "http://purl.org/net/sword/package/Binary", + "http://purl.org/net/sword/package/METSDSpaceSIP" + ] + + # maximum upload size to be allowed, in bytes (this default is 16Mb) + self.max_upload_size = 16777216 + #self.max_upload_size = 0 # used to generate errors + + # list of package formats that SSS can provide when retrieving the Media Resource + self.sword_disseminate_package = [ + "http://purl.org/net/sword/package/SimpleZip" + ] + + # Supported package format disseminators; for the content type (dictionary key), the associated + # class will be used to package the content for dissemination + self.package_disseminators = { + ContentType("application", "zip", None, "http://purl.org/net/sword/package/SimpleZip").media_format() : DefaultDisseminator, + ContentType("application", "zip").media_format() : DefaultDisseminator, + ContentType("application", "atom+xml", "type=feed").media_format() : FeedDisseminator + } + + # Supported package format ingesters; for the Packaging header (dictionary key), the associated class will + # be used to unpackage deposited content + self.package_ingesters = { + 
"http://purl.org/net/sword/package/Binary" : BinaryIngester, + "http://purl.org/net/sword/package/SimpleZip" : SimpleZipIngester, + "http://purl.org/net/sword/package/METSDSpaceSIP" : METSDSpaceIngester + } + + self.entry_ingester = DefaultEntryIngester + + # supply this header in the Packaging header to generate a http://purl.org/net/sword/error/ErrorContent + # sword error + self.error_content_package = "http://purl.org/net/sword/package/error" + + # we can turn off updates and deletes in order to examine the behaviour of Method Not Allowed errors + self.allow_update = True + self.allow_delete = True + + # we can turn off deposit receipts, which is allowed by the specification + self.return_deposit_receipt = True + + # generate a UUID to represent this request, for logging purposes + self.rid = str(uuid.uuid4()) + +class CherryPyConfiguration(Configuration): + def __init__(self): + Configuration.__init__(self) + +class ApacheConfiguration(Configuration): + def __init__(self): + Configuration.__init__(self) + self.base_url = 'http://localhost/sss/' + self.store_dir = '/Users/richard/tmp/store' + self.authenticate = False Added: sss/branches/sss-2/sss/core.py =================================================================== --- sss/branches/sss-2/sss/core.py (rev 0) +++ sss/branches/sss-2/sss/core.py 2012-01-05 17:30:49 UTC (rev 416) @@ -0,0 +1,575 @@ +import web, os, base64 +from lxml import etree +from sss_logging import SSSLogger +from datetime import datetime + +# get the global logger +sssl = SSSLogger() +ssslog = sssl.getLogger() + +# create the global configuration +from config import CherryPyConfiguration +global_configuration = CherryPyConfiguration() + +# FIXME: SWORDSpec has a lot of webpy stuff in it; needs to be cleaned and +# divided + + +class Namespaces(object): + """ + This class encapsulates all the namespace declarations that we will need + """ + def __init__(self): + # AtomPub namespace and lxml format + self.APP_NS = 
"http://www.w3.org/2007/app" + self.APP = "{%s}" % self.APP_NS + + # Atom namespace and lxml format + self.ATOM_NS = "http://www.w3.org/2005/Atom" + self.ATOM = "{%s}" % self.ATOM_NS + + # SWORD namespace and lxml format + self.SWORD_NS = "http://purl.org/net/sword/terms/" + self.SWORD = "{%s}" % self.SWORD_NS + + # Dublin Core namespace and lxml format + self.DC_NS = "http://purl.org/dc/terms/" + self.DC = "{%s}" % self.DC_NS + + # RDF namespace and lxml format + self.RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" + self.RDF = "{%s}" % self.RDF_NS + + # ORE namespace and lxml format + self.ORE_NS = "http://www.openarchives.org/ore/terms/" + self.ORE = "{%s}" % self.ORE_NS + + # ORE ATOM + self.ORE_ATOM_NS = "http://www.openarchives.org/ore/atom/" + self.ORE_ATOM = "{%s}" % self.ORE_ATOM_NS + +# REQUEST/RESPONSE CLASSES +####################################################################### +# These classes are used as the glue between the web.py web interface layer and the underlying sword server, allowing +# them to exchange messages agnostically to the interface + +class Auth(object): + def __init__(self, by=None, obo=None, target_owner_unknown=False): + self.by = by + self.obo = obo + self.target_owner_unknown = target_owner_unknown + + def success(self): + return self.by is not None and not self.target_owner_unknown + +class SWORDRequest(object): + """ + General class to represent any sword request (such as deposit or delete) + """ + def __init__(self): + """ + There are 4 HTTP sourced properties: + - on_behalf_of - On-Behalf-Of in HTTP; the user being deposited on behalf of + - packaging - Packaging in HTTP; the packaging format being used + - in_progress - In-Progress in HTTP; whether the deposit is complete or not from a client perspective + - metadata_relevant - Metadata-Relevant; whether or not the deposit contains relevant metadata + """ + + self.on_behalf_of = None + self.packaging = "http://purl.org/net/sword/package/Binary" # if this isn't 
populated externally, use the default + self.in_progress = False + self.metadata_relevant = True # the server MAY assume that it is True + self.auth = None + self.content_md5 = None + self.slug = None + + def set_by_header(self, key, value): + # FIXME: this is a webpy thing.... + """ + Convenience method to take a relevant HTTP header and its value and add it to this object. + e.g. set_by_header("On-Behalf-Of", "richard") Notice that the format of the headers used + here is the web.py format which is all upper case, prefixed with HTTP_, with all - converted to _ + (for some unknown reason) + """ + ssslog.debug("Setting Header %s : %s" % (key, value)) + if key == "HTTP_ON_BEHALF_OF": + self.on_behalf_of = value + elif key == "HTTP_PACKAGING" and value is not None: + self.packaging = value + elif key == "HTTP_IN_PROGRESS": + self.in_progress = (value.strip() == "true") + elif key == "HTTP_METADATA_RELEVANT": + self.metadata_relevant = (value.strip() == "true") + elif key == "HTTP_CONTENT_MD5": + self.content_md5 = value + elif key == "HTTP_SLUG": + self.slug = value + +class DepositRequest(SWORDRequest): + """ + Class to represent a request to deposit some content onto the server + """ + def __init__(self): + """ + There are 3 content related properties: + - content - the incoming content file to be deposited + - atom - the incoming atom document to be deposited (may be None) + - filename - the desired name of the incoming content + """ + SWORDRequest.__init__(self) + + # content related + self.content_type = "application/octet-stream" + self.content = None + self.atom = None + self.filename = "unnamed.file" + self.too_large = False + +class DepositResponse(object): + """ + Class to represent the response to a deposit request + """ + def __init__(self): + """ + Properties: + - created - was the resource created on the server + - accepted - was the resource accepted by the server (but not yet created) + - error_code - if there was an error, what HTTP status code + - 
error - sword error document if relevant + - receipt - deposit receipt if successful deposit + - location - the Edit-URI which will be supplied to the client as the Location header in responses + """ + self.created = False + self.accepted = False + self.error_code = None + self.error = None + self.receipt = None + self.location = None + +class MediaResourceResponse(object): + """ + Class to represent the response to a request to retrieve the Media Resource + """ + def __init__(self): + """ + There are three properties: + redirect - boolean, does the client need to be redirected to another URL for the media resource + url - If redirect, then this is the URL to redirect the client to + filepath - If not redirect, then this is the path to the file that the server should serve + """ + self.redirect = False + self.url = None + self.filepath = None + self.packaging = None + +class DeleteRequest(SWORDRequest): + """ + Class Representing a request to delete either the content or the container itself. + """ + def __init__(self): + """ + The properties of this class are as per SWORDRequest + """ + SWORDRequest.__init__(self) + +class DeleteResponse(object): + """ + Class to represent the response to a request to delete the content or the container + """ + def __init__(self): + """ + There are 3 properties: + error_code - if there was an error, the http code associated + error - the sworderror if appropriate + receipt - if successful and a request for deleting content (not container) the deposit receipt + """ + self.error_code = None + self.error = None + self.receipt = None + +# Operational SWORD Classes +############################################################################# +# Classes which carry out the grunt work of the SSS + +class SWORDSpec(object): + """ + Class which attempts to represent the specification itself. 
Instead of being operational like the SWORDServer + class, it attempts to just be able to interpret the supplied http headers and content bodies and turn them into + the entities with which SWORD works. The jury is out, in my mind, whether this class is a useful separation, but + for what it's worth, here it is ... + """ + def __init__(self): + # FIXME: this is a webpy thing ... + # The HTTP headers that are part of the specification (from a web.py perspective - don't be fooled, these + # aren't the real HTTP header names - see the spec) + self.sword_headers = [ + "HTTP_ON_BEHALF_OF", "HTTP_PACKAGING", "HTTP_IN_PROGRESS", "HTTP_METADATA_RELEVANT", + "HTTP_CONTENT_MD5", "HTTP_SLUG", "HTTP_ACCEPT_PACKAGING" + ] + + self.error_content_uri = "http://purl.org/net/sword/error/ErrorContent" + self.error_checksum_mismatch_uri = "http://purl.org/net/sword/error/ErrorChecksumMismatch" + self.error_bad_request_uri = "http://purl.org/net/sword/error/ErrorBadRequest" + self.error_target_owner_unknown_uri = "http://purl.org/net/sword/error/TargetOwnerUnknown" + self.error_mediation_not_allowed_uri = "http://purl.org/net/sword/error/MediationNotAllowed" + self.error_method_not_allowed_uri = "http://purl.org/net/sword/error/MethodNotAllowed" + self.error_max_upload_size_exceeded = "http://purl.org/net/sword/error/MaxUploadSizeExceeded" + + def validate_deposit_request(self, web, allow_multipart=True): + dict = web.ctx.environ + + # get each of the allowed SWORD headers that can be validated and see if they do + ip = dict.get("HTTP_IN_PROGRESS") + if ip is not None and ip != "true" and ip != "false": + return "In-Progress must be 'true' or 'false'" + + sm = dict.get("HTTP_METADATA_RELEVANT") + if sm is not None and sm != "true" and sm != "false": + return "Metadata-Relevant must be 'true' or 'false'" + + # there must be both an "atom" and "payload" input or data in web.data() + webin = web.input() + if len(webin) != 2 and len(webin) > 0: + return "Multipart request does not contain 
exactly 2 parts" + if len(webin) >= 2 and not webin.has_key("atom") and not webin.has_key("payload"): + return "Multipart request must contain Content-Dispositions with names 'atom' and 'payload'" + if len(webin) > 0 and not allow_multipart: + return "Multipart request not permitted in this context" + + # if we get to here then we have a valid multipart or no multipart + if len(webin) != 2: # if it is not multipart + if web.data() is None: # and there is no content + return "No content sent to the server" + + # validates + return None + + def validate_delete_request(self, web): + dict = web.ctx.environ + + # get each of the allowed SWORD headers that can be validated and see if they do + ip = dict.get("HTTP_IN_PROGRESS") + if ip is not None and ip != "true" and ip != "false": + return "In-Progress must be 'true' or 'false'" + + sm = dict.get("HTTP_METADATA_RELEVANT") + if sm is not None and sm != "true" and sm != "false": + return "Metadata-Relevant must be 'true' or 'false'" + + # validates + return None + + def get_deposit(self, web, auth=None, atom_only=False): + # FIXME: this reads files into memory, and therefore does not scale + # FIXME: this does not deal with the Media Part headers on a multipart deposit + """ + Take a web.py web object and extract from it the parameters and content required for a SWORD deposit. This + includes determining whether this is an Atom Multipart request or not, and extracting the atom/payload where + appropriate. It also includes extracting the HTTP headers which are relevant to deposit, and for those not + supplied providing their defaults in the returned DepositRequest object + """ + d = DepositRequest() + + # now go through the headers and populate the Deposit object + dict = web.ctx.environ + + # get the headers that have been provided. 
Any headers which have not been provided have default values + # supplied in the DepositRequest object's constructor + ssslog.debug("Incoming HTTP headers: " + str(dict)) + empty_request = False + for head in dict.keys(): + if head in self.sword_headers: + d.set_by_header(head, dict[head]) + if head == "HTTP_CONTENT_DISPOSITION": + ssslog.debug("Reading Header %s : %s" % (head, dict[head])) + d.filename = self.extract_filename(dict[head]) + ssslog.debug("Extracted filename %s from %s" % (d.filename, dict[head])) + if head == "CONTENT_TYPE": + ssslog.debug("Reading Header %s : %s" % (head, dict[head])) + ct = dict[head] + d.content_type = ct + if ct.startswith("application/atom+xml"): + atom_only = True + if head == "CONTENT_LENGTH": + ssslog.debug("Reading Header %s : %s" % (head, dict[head])) + if dict[head] == "0": + empty_request = True + cl = int(dict[head]) # content length as an integer + if cl > global_configuration.max_upload_size: + d.too_large = True + return d + + # first we need to find out if this is a multipart or not + webin = web.input() + if len(webin) == 2: + ssslog.info("Received multipart deposit request") + d.atom = webin['atom'] + # read the zip file from the base64 encoded string + d.content = base64.decodestring(webin['payload']) + elif not empty_request: + # if this wasn't a multipart, and isn't an empty request, then the data is in web.data(). This could be a binary deposit or + # an atom entry deposit - reply on the passed/determined argument to determine which + if atom_only: + ssslog.info("Received Entry deposit request") + d.atom = web.data() + else: + ssslog.info("Received Binary deposit request") + d.content = web.data() + + # now just attach the authentication data and return + d.auth = auth + return d + + def extract_filename(self, cd): + """ get the filename out of the content disposition header """ + # ok, this is a bit obtuse, but it was fun making it. 
It's not hard to understand really, if you break + # it down + return cd[cd.find("filename=") + len("filename="):cd.find(";", cd.find("filename=")) if cd.find(";", cd.find("filename=")) > -1 else len(cd)] + + def get_delete(self, dict, auth=None): + """ + Take a web.py web object and extract from it the parameters and content required for a SWORD delete request. + It mainly extracts the HTTP headers which are relevant to delete, and for those not supplied provides their + defaults in the returned DeleteRequest object + """ + d = DeleteRequest() + + # we just want to parse out the headers that are relevant + for head in dict.keys(): + if head in self.sword_headers: + d.set_by_header(head, dict[head]) + + # now just attach the authentication data and return + d.auth = auth + return d + +class Statement(object): + """ + Class representing the Statement; a description of the object as it appears on the server + """ + def __init__(self): + """ + The statement has 5 important properties: + - aggregation_uri - The URI of the aggregation in ORE terms + - rem_uri - The URI of the Resource Map in ORE terms + - original_deposits - The list of original packages uploaded to the server (set with original_deposit()) + - in_progress - Is the submission in progress (boolean) + - aggregates - the non-original deposit files associated with the item + """ + self.aggregation_uri = None + self.rem_uri = None + self.original_deposits = [] + self.aggregates = [] + self.in_progress = False + + # URIs to use for the two supported states in SSS + self.in_progress_uri = "http://purl.org/net/sword/state/in-progress" + self.archived_uri = "http://purl.org/net/sword/state/archived" + + # the descriptions associated with the two supported states in SSS + self.states = { + self.in_progress_uri : "The work is currently in progress, and has not passed to a reviewer", + self.archived_uri : "The work has passed through review and is now in the archive" + } + + # Namespace map for XML serialisation 
+ self.ns = Namespaces() + self.smap = {"rdf" : self.ns.RDF_NS, "ore" : self.ns.ORE_NS, "sword" : self.ns.SWORD_NS} + self.asmap = {"oreatom" : self.ns.ORE_ATOM_NS, "atom" : self.ns.ATOM_NS, "rdf" : self.ns.RDF_NS, "ore" : self.ns.ORE_NS, "sword" : self.ns.SWORD_NS} + self.fmap = {"atom" : self.ns.ATOM_NS, "sword" : self.ns.SWORD_NS} + + def __str__(self): + return str(self.aggregation_uri) + ", " + str(self.rem_uri) + ", " + str(self.original_deposits) + + def original_deposit(self, uri, deposit_time, packaging_format, by, obo): + """ + Add an original deposit to the statement + Args: + - uri: The URI to the original deposit + - deposit_time: When the deposit was originally made + - packaging_format: The package format of the deposit, as supplied in the Packaging header + """ + self.original_deposits.append((uri, deposit_time, packaging_format, by, obo)) + + def add_normalised_aggregations(self, aggs): + for agg in aggs: + if agg not in self.aggregates: + self.aggregates.append(agg) + + def load(self, filepath): + """ + Populate this statement object from the XML serialised statement to be found at the specified filepath + """ + f = open(filepath, "r") + rdf = etree.fromstring(f.read()) + + aggs = [] + ods = [] + for desc in rdf.getchildren(): + packaging = None + depositedOn = None + deposit_by = None + deposit_obo = None + about = desc.get(self.ns.RDF + "about") + for element in desc.getchildren(): + if element.tag == self.ns.ORE + "aggregates": + resource = element.get(self.ns.RDF + "resource") + aggs.append(resource) + if element.tag == self.ns.ORE + "describes": + resource = element.get(self.ns.RDF + "resource") + self.aggregation_uri = resource + self.rem_uri = about + if element.tag == self.ns.SWORD + "state": + state = element.get(self.ns.RDF + "resource") + self.in_progress = state == "http://purl.org/net/sword/state/in-progress" + if element.tag == self.ns.SWORD + "packaging": + packaging = element.get(self.ns.RDF + "resource") + if element.tag == 
self.ns.SWORD + "depositedOn": + deposited = element.text + depositedOn = datetime.strptime(deposited, "%Y-%m-%dT%H:%M:%SZ") + if element.tag == self.ns.SWORD + "depositedBy": + deposit_by = element.text + if element.tag == self.ns.SWORD + "depositedOnBehalfOf": + deposit_obo = element.text + if packaging is not None: + ods.append(about) + self.original_deposit(about, depositedOn, packaging, deposit_by, deposit_obo) + + # sort out the ordinary aggregations from the original deposits + self.aggregates = [] + for agg in aggs: + if agg not in ods: + self.aggregates.append(agg) + + def serialise(self): + """ + Serialise this statement into an RDF/XML string + """ + rdf = self.get_rdf_xml() + return etree.tostring(rdf, pretty_print=True) + + def serialise_atom(self): + """ + Serialise this statement to an Atom Feed document + """ + # create the root atom feed element + feed = etree.Element(self.ns.ATOM + "feed", nsmap=self.fmap) + + # create the sword:state term in the root of the feed + state_uri = self.in_progress_uri if self.in_progress else self.archived_uri + state = etree.SubElement(feed, self.ns.SWORD + "state") + state.set("href", state_uri) + meaning = etree.SubElement(state, self.ns.SWORD + "stateDescription") + meaning.text = self.states[state_uri] + + # now do an entry for each original deposit + for (uri, datestamp, format_uri, by, obo) in self.original_deposits: + # FIXME: this is not an official atom entry yet + entry = etree.SubElement(feed, self.ns.ATOM + "entry") + + category = etree.SubElement(entry, self.ns.ATOM + "category") + category.set("scheme", self.ns.SWORD_NS) + category.set("term", self.ns.SWORD_NS + "originalDeposit") + category.set("label", "Orignal Deposit") + + # Media Resource Content URI (Cont-URI) + content = etree.SubElement(entry, self.ns.ATOM + "content") + content.set("type", "application/zip") + content.set("src", uri) + + # add all the foreign markup + + format = etree.SubElement(entry, self.ns.SWORD + "packaging") + format.text 
= format_uri + + deposited = etree.SubElement(entry, self.ns.SWORD + "depositedOn") + deposited.text = datestamp.strftime("%Y-%m-%dT%H:%M:%SZ") + + deposit_by = etree.SubElement(entry, self.ns.SWORD + "depositedBy") + deposit_by.text = by + + if obo is not None: + deposit_obo = etree.SubElement(entry, self.ns.SWORD + "depositedOnBehalfOf") + deposit_obo.text = obo + + # finally do an entry for all the ordinary aggregated resources + for uri in self.aggregates: + entry = etree.SubElement(feed, self.ns.ATOM + "entry") + content = etree.SubElement(entry, self.ns.ATOM + "content") + content.set("type", "application/octet-stream") + content.set("src", uri) + + return etree.tostring(feed, pretty_print=True) + + def get_rdf_xml(self): + """ + Get an lxml Element object back representing this statement + """ + + # we want to create an ORE resource map, and also add on the sword specific bits for the original deposits and the state + + # create the RDF root + rdf = etree.Element(self.ns.RDF + "RDF", nsmap=self.smap) + + # in the RDF root create a Description for the REM which ore:describes the Aggregation + description1 = etree.SubElement(rdf, self.ns.RDF + "Description") + description1.set(self.ns.RDF + "about", self.rem_uri) + describes = etree.SubElement(description1, self.ns.ORE + "describes") + describes.set(self.ns.RDF + "resource", self.aggregation_uri) + + # in the RDF root create a Description for the Aggregation which is ore:isDescribedBy the REM + description = etree.SubElement(rdf, self.ns.RDF + "Description") + description.set(self.ns.RDF + "about", self.aggregation_uri) + idb = etree.SubElement(description, self.ns.ORE + "isDescribedBy") + idb.set(self.ns.RDF + "resource", self.rem_uri) + + # Create ore:aggregates for all ordinary aggregated files + for uri in self.aggregates: + aggregates = etree.SubElement(description, self.ns.ORE + "aggregates") + aggregates.set(self.ns.RDF + "resource", uri) + + # Create ore:aggregates and sword:originalDeposit relations 
for the original deposits + for (uri, datestamp, format, by, obo) in self.original_deposits: + # standard ORE aggregates statement + aggregates = etree.SubElement(description, self.ns.ORE + "aggregates") + aggregates.set(self.ns.RDF + "resource", uri) + + # assert that this is an original package + original = etree.SubElement(description, self.ns.SWORD + "originalDeposit") + original.set(self.ns.RDF + "resource", uri) + + # now do the state information + state_uri = self.in_progress_uri if self.in_progress else self.archived_uri + state = etree.SubElement(description, self.ns.SWORD + "state") + state.set(self.ns.RDF + "resource", state_uri) + + # Build the Description elements for the original deposits, with their sword:depositedOn and sword:packaging + # relations + for (uri, datestamp, format_uri, by, obo) in self.original_deposits: + desc = etree.SubElement(rdf, self.ns.RDF + "Description") + desc.set(self.ns.RDF + "about", uri) + + format = etree.SubElement(desc, self.ns.SWORD + "packaging") + format.set(self.ns.RDF + "resource", format_uri) + + deposited = etree.SubElement(desc, self.ns.SWORD + "depositedOn") + deposited.set(self.ns.RDF + "datatype", "http://www.w3.org/2001/XMLSchema#dateTime") + deposited.text = datestamp.strftime("%Y-%m-%dT%H:%M:%SZ") + + deposit_by = etree.SubElement(desc, self.ns.SWORD + "depositedBy") + deposit_by.set(self.ns.RDF + "datatype", "http://www.w3.org/2001/XMLSchema#string") + deposit_by.text = by + + if obo is not None: + deposit_obo = etree.SubElement(desc, self.ns.SWORD + "depositedOnBehalfOf") + deposit_obo.set(self.ns.RDF + "datatype", "http://www.w3.org/2001/XMLSchema#string") + deposit_obo.text = obo + + # finally do a description for the state + sdesc = etree.SubElement(rdf, self.ns.RDF + "Description") + sdesc.set(self.ns.RDF + "about", state_uri) + meaning = etree.SubElement(sdesc, self.ns.SWORD + "stateDescription") + meaning.text = self.states[state_uri] + + return rdf + Added: 
sss/branches/sss-2/sss/ingesters_disseminators.py =================================================================== --- sss/branches/sss-2/sss/ingesters_disseminators.py (rev 0) +++ sss/branches/sss-2/sss/ingesters_disseminators.py 2012-01-05 17:30:49 UTC (rev 416) @@ -0,0 +1,281 @@ +from zipfile import ZipFile +from lxml import etree + +# get the global logger +from sss_logging import SSSLogger +sssl = SSSLogger() +ssslog = sssl.getLogger() + +# FIXME: this is a duplicate of the one in core. We need to sort out our +# circular imports *urgh* +class Namespaces(object): + """ + This class encapsulates all the namespace declarations that we will need + """ + def __init__(self): + # AtomPub namespace and lxml format + self.APP_NS = "http://www.w3.org/2007/app" + self.APP = "{%s}" % self.APP_NS + + # Atom namespace and lxml format + self.ATOM_NS = "http://www.w3.org/2005/Atom" + self.ATOM = "{%s}" % self.ATOM_NS + + # SWORD namespace and lxml format + self.SWORD_NS = "http://purl.org/net/sword/terms/" + self.SWORD = "{%s}" % self.SWORD_NS + + # Dublin Core namespace and lxml format + self.DC_NS = "http://purl.org/dc/terms/" + self.DC = "{%s}" % self.DC_NS + + # RDF namespace and lxml format + self.RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" + self.RDF = "{%s}" % self.RDF_NS + + # ORE namespace and lxml format + self.ORE_NS = "http://www.openarchives.org/ore/terms/" + self.ORE = "{%s}" % self.ORE_NS + + # ORE ATOM + self.ORE_ATOM_NS = "http://www.openarchives.org/ore/atom/" + self.ORE_ATOM = "{%s}" % self.ORE_ATOM_NS + +class DisseminationPackager(object): + def __init__(self, dao, uri_manager): + pass + + """ + Interface for all classes wishing to provide dissemination packaging services to the SSS + """ + def package(self, collection, id): + """ + Package up all the content in the specified container. This method must be implemented by the extender. 
The + method should create a package in the store directory, and then return to the caller the path to that file + so that it can be served back to the client + """ + pass + + def get_uri(self): + return "http://purl.org/net/sword/package/SimpleZip" + +class IngestPackager(object): + def __init__(self, dao): + pass + + def ingest(self, collection, id, filename, metadata_relevant): + """ + The package with the supplied filename has been placed in the identified container. This should be inspected + and unpackaged. Implementations should note that there is optionally an atom document in the container which + may need to be inspected, and this can be retrieved from DAO.get_atom_content(). If the metadata_relevant + argument is False, implementations should not change the already extracted metadata in the container + """ + return [] + +class DefaultDisseminator(DisseminationPackager): + """ + Basic default packager, this just zips up everything except the SSS specific files in the container and stores + them in a file called sword-default-package.zip. 
+ """ + def __init__(self, dao, uri_manager): + self.dao = dao + + def package(self, collection, id): + """ package up the content """ + + # get a list of the relevant content files + files = self.dao.list_content(collection, id, exclude=["sword-default-package.zip"]) + + # create a zip file with all the original zip files in it + zpath = self.dao.get_store_path(collection, id, "sword-default-package.zip") + z = ZipFile(zpath, "w") + for file in files: + z.write(self.dao.get_store_path(collection, id, file), file) + z.close() + + # return the path to the package to the caller + return zpath + +class FeedDisseminator(DisseminationPackager): + def __init__(self, dao, uri_manager): + self.dao = dao + self.ns = Namespaces() + self.um = uri_manager + self.nsmap = {None: self.ns.ATOM_NS} + + def package(self, collection, id): + """ create a feed representation of the package """ + # get a list of the relevant content files + files = self.dao.list_content(collection, id, exclude=["mediaresource.feed.xml"]) + + # create a feed object with all the files as entries + feed = etree.Element(self.ns.ATOM + "feed", nsmap=self.nsmap) + + for file in files: + entry = etree.SubElement(feed, self.ns.ATOM + "entry") + + em = etree.SubElement(entry, self.ns.ATOM + "link") + em.set("rel", "edit-media") + em.set("href", self.um.part_uri(collection, id, file)) + + edit = etree.SubElement(entry, self.ns.ATOM + "link") + edit.set("rel", "edit") + edit.set("href", self.um.part_uri(collection, id, file) + ".atom") + + content = etree.SubElement(entry, self.ns.ATOM + "link") + content.set("type", "application/octet-stream") # FIXME: we're not storing content types, so we don't know + content.set("src", self.um.part_uri(collection, id, file)) + + fpath = self.dao.get_store_path(collection, id, "mediaresource.feed.xml") + f = open(fpath, "wb") + f.write(etree.tostring(feed, pretty_print=True)) + f.close() + + return fpath + + def get_uri(self): + return None + +class 
class SimpleZipIngester(IngestPackager):
    """
    Ingester for SimpleZip packages: unpacks the deposited zip into the container and, where relevant,
    extracts metadata from the container's atom document
    """
    def __init__(self, dao):
        self.dao = dao
        self.ns = Namespaces()

    def ingest(self, collection, id, filename, metadata_relevant=True):
        """
        Unpack the zip file with the supplied filename into its container

        Args:
        - collection: the collection the container lives in
        - id: the identifier of the container
        - filename: the name of the deposited zip file within the container
        - metadata_relevant: if False, the metadata already stored for the container is left untouched
        Returns the list of names of the files in the zip (the derived resources)
        """
        # First, let's just extract all the contents of the zip
        z = ZipFile(self.dao.get_store_path(collection, id, filename))
        try:
            # keep track of the names of the files in the zip, as these will become
            # our derived resources
            derived_resources = z.namelist()

            # FIXME: what we do here is intrinsically insecure, but SSS is not a
            # production service, so we're not worrying about it!
            # NOTE(review): extractall() on an untrusted archive allows paths outside the container
            z.extractall(self.dao.get_store_path(collection, id))
        finally:
            # release the file handle whether or not the extraction succeeded
            z.close()

        # check for the atom document; if there is none there's no metadata to extract, and if the
        # metadata is not relevant then we don't need to continue
        atom = self.dao.get_atom_content(collection, id)
        if atom is None or not metadata_relevant:
            return derived_resources

        metadata = {}
        entry = etree.fromstring(atom)

        # go through each element in the atom entry and just process the ones we care about
        # explicitly retrieve the atom based metadata first
        for element in entry:
            if element.tag == self.ns.ATOM + "title":
                self.a_insert(metadata, "title", self._text(element))
            elif element.tag == self.ns.ATOM + "updated":
                self.a_insert(metadata, "date", self._text(element))
            elif element.tag == self.ns.ATOM + "author":
                # an author is represented by the text of all of its child elements, space separated
                authors = " ".join([self._text(part) for part in element])
                self.a_insert(metadata, "creator", authors.strip())
            elif element.tag == self.ns.ATOM + "summary":
                self.a_insert(metadata, "abstract", self._text(element))

        # now go through and retrieve the dcterms from the entry
        for element in entry:
            # skip children whose tag is not a string, as there is no tag name to inspect
            if not isinstance(element.tag, basestring):
                continue

            # we operate an additive policy with metadata.  Duplicate
            # keys are allowed, but duplicate key/value pairs are not.
            if element.tag.startswith(self.ns.DC):
                key = element.tag[len(self.ns.DC):]
                self.a_insert(metadata, key, self._text(element))

        self.dao.store_metadata(collection, id, metadata)

        return derived_resources

    def _text(self, element):
        """
        Return the stripped text of the element; an element with no text at all (e.g. <title/>) yields
        the empty string rather than raising an AttributeError on None
        """
        return (element.text or "").strip()

    def a_insert(self, d, key, value):
        """
        Additive insert: append value to the list stored at d[key], creating the list if necessary, and
        ignoring values which are already recorded against that key
        """
        values = d.setdefault(key, [])
        if value not in values:
            values.append(value)
" + str(additive)) + + # store the atom + self.dao.store_atom(collection, id, atom) + + # now extract/augment the metadata + metadata = {} + if additive: + # start with any existing metadata + metadata = self.dao.get_metadata(collection, id) + + ssslog.debug("Existing Metadata (before new ingest): " + str(metadata)) + + entry = etree.fromstring(atom) + + # go through each element in the atom entry and just process the ones we care about + # explicitly retrieve the atom based metadata first + for element in entry.getchildren(): + if element.tag == self.ns.ATOM + "title": + self.a_insert(metadata, "title", element.text.strip()) + if element.tag == self.ns.ATOM + "updated": + self.a_insert(metadata, "date", element.text.strip()) + if element.tag == self.ns.ATOM + "author": + authors = "" + for names in element.getchildren(): + authors += names.text.strip() + " " + self.a_insert(metadata, "creator", authors.strip()) + if element.tag == self.ns.ATOM + "summary": + self.a_insert(metadata, "abstract", element.text.strip()) + + # now go through and retrieve the dcterms from the entry + for element in entry.getchildren(): + if not isinstance(element.tag, basestring): + continue + + # we operate an additive policy with metadata. Duplicate + # keys are allowed, but duplicate key/value pairs are not. 
class ContentType(object):
    """
    Class to represent a content type requested through content negotiation
    """
    def __init__(self, type=None, subtype=None, params=None, packaging=None):
        """
        Properties:
        type    - the main type of the content.  e.g. in text/html, the type is "text"
        subtype - the subtype of the content.  e.g. in text/html the subtype is "html"
        params  - as per the mime specification, this represents the parameter extension to the type, e.g. with
                    application/atom+xml;type=entry, the params are "type=entry"
        packaging - the packaging format identifier associated with the content, or None

        So, for example:
        application/atom+xml;type=entry => type="application", subtype="atom+xml", params="type=entry"
        """
        self.type = type
        self.subtype = subtype
        self.params = params
        self.packaging = packaging

    def from_mimetype(self, mimetype):
        """
        Populate this object from a mimetype string of the form <supertype>/<subtype>[;<params>]

        Everything after the first ";" is taken as the params, so mimetypes carrying several parameters
        (e.g. application/atom+xml;type=entry;charset=utf-8) are retained rather than silently ignored.
        If the mimetype has no params part, any params already set on this object are left as they are.

        Raises ValueError if the mimetype does not contain a "/"
        """
        media_type, separator, params = mimetype.partition(";")
        self.type, self.subtype = media_type.strip().split("/", 1)
        if separator:
            self.params = params.strip()

    def mimetype(self):
        """
        Turn the content type into its mimetype representation
        """
        mt = self.type + "/" + self.subtype
        if self.params is not None:
            mt += ";" + self.params
        return mt

    # NOTE: we only use this to construct a canonical form which includes the package to do comparisons over
    def media_format(self):
        """
        Return the canonical string form of this content type, including the packaging if there is any
        """
        pack = ""
        if self.packaging is not None:
            pack = "(packaging=\"" + self.packaging + "\") "
        return "(& (type=\"" + self.mimetype() + "\") " + pack + ")"

    def matches(self, other, packaging_wildcard=False):
        """
        Determine whether this ContentType and the supplied other ContentType are matches.  This includes full equality
        or whether the wildcards (*) which can be supplied for type or subtype properties are in place in either
        partner in the match.

        If packaging_wildcard is True, a packaging of None on either side is treated as matching any packaging;
        otherwise the two packaging values must be equal.
        """
        tmatch = self.type == "*" or other.type == "*" or self.type == other.type
        smatch = self.subtype == "*" or other.subtype == "*" or self.subtype == other.subtype
        # FIXME: there is some ambiguity in mime as to whether the omission of the params part is the same as
        # a wildcard.  For the purposes of convenience we have assumed here that it is, otherwise a request for
        # */* will not match any content type which has parameters
        pmatch = self.params is None or other.params is None or self.params == other.params

        # A similar problem exists for packaging.  We allow the user to tell us if packaging should be
        # wildcard sensitive
        packmatch = self.packaging == other.packaging
        if packaging_wildcard:
            packmatch = packmatch or self.packaging is None or other.packaging is None
        return tmatch and smatch and pmatch and packmatch

    def __eq__(self, other):
        # anything which is not a ContentType cannot be compared on media format; let python fall back
        # to its default comparison rather than raising an AttributeError
        if not isinstance(other, ContentType):
            return NotImplemented
        return self.media_format() == other.media_format()

    def __ne__(self, other):
        # python 2 does not derive != from ==, so it has to be spelled out to keep the two consistent
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # equal content types must hash equally; note that the hash follows the current field values,
        # so a ContentType should not be modified while it is being used as a dictionary key
        return hash(self.media_format())

    def __str__(self):
        return self.media_format()

    def __repr__(self):
        return str(self)
def analyse_accept(self, accept, packaging=None):
    """
    Analyse the Accept header string from the HTTP headers and return a structured dictionary with each
    content types grouped by their common q values, thus:

    dict = {
        1.0 : [<ContentType>, <ContentType>],
        0.8 : [<ContentType>],
        0.5 : [<ContentType>, <ContentType>]
    }

    This method will guarantee that every content type has some q value associated with it, even if this was not
    supplied in the original Accept header; it will be inferred based on the rules of content negotiation.  Every
    key in the returned dictionary is a float, whether it was explicit in the header or inferred.

    Args:
    - accept: the value of the Accept header
    - packaging: the packaging format to attach to every ContentType that is created
    Raises ValueError if a q value in the header is not a number
    """
    # FIXME: we need to somehow handle q=0.0 in here and in other related methods

    # accept headers are a list of content types and q values, in a comma separated list.  During this first
    # run we record each entry (in header order) along with the highest explicitly requested q
    entries = []
    highest_q = 0.0

    for part in accept.split(","):
        # the components of the part are the media range followed by any number of ";" separated
        # parameters, one of which may be the q value
        components = [component.strip() for component in part.split(";")]

        # the first component is always the media range
        media_range = components[0]
        if not media_range:
            # a blank entry (e.g. from a trailing comma) carries no preference at all
            continue

        # If there are no params, we will use None; if there is no q we also use None, and infer the q
        # value later from the position of the entry in the header
        q = None
        extras = []
        for component in components[1:]:
            if component.startswith("q="):
                q = float(component[2:])  # strip the "q=" from the start of the q value
            elif component:
                extras.append(component)

        params = None
        if extras:
            params = ";".join(extras)

        # if the q value is the highest one we've seen so far, record it
        if q is not None and q > highest_q:
            highest_q = q

        entries.append((media_range, params, q))

    # once we've finished the analysis we'll know what the highest explicitly requested q will be.  This may leave
    # us with a gap between 1.0 and the highest requested q, into which we will want to put the content types which
    # did not have explicitly assigned q values.  Here we calculate the size of that gap, so that we can use it
    # later on in positioning those elements.  Note that the gap may be 0.0.
    q_range = 1.0 - highest_q
    total = len(entries)

    # set up a dictionary to hold our results.  The dictionary will be keyed with the q value, and the
    # value of each key will be an array of ContentType objects
    analysed = {}

    for position, (media_range, params, q) in enumerate(entries, 1):
        # break the media range into super and sub types for the ContentType constructor; a range with no
        # subtype (such as the bare "*" which some clients send) is treated as matching any subtype
        if "/" in media_range:
            supertype, subtype = media_range.split("/", 1)
        else:
            supertype, subtype = media_range, "*"

        if q is None:
            # no q was assigned in the Accept header, so we have to calculate it using the following equation
            # which creates a q value within "q_range" of 1.0 [the first part of the eqn] based on the fraction
            # of the way through the total accept header list scaled by the q_range [the second part of the
            # eqn].  The division is explicitly a float division so that the fraction is not truncated to 0
            q = (1.0 - q_range) + ((float(position) / total) * q_range)

        self.insert(analysed, q, ContentType(supertype, subtype, params, packaging))

    # now we have a dictionary keyed by q value which we can return
    return analysed
def get_acceptable(self, client, server):
    """
    Take the client content negotiation requirements - as returned by analyse_accept() - and the server's
    array of supported types (in order of preference) and determine the most acceptable format to return.

    This method always returns the client's most preferred format if the server supports it, irrespective of the
    server's preference.  If the client has no discernable preference between two formats (i.e. they have the same
    q value) then the server's preference is taken into account.

    Args:
    - client: dictionary of q value to list of ContentType objects
    - server: list of ContentType objects in the server's order of preference
    Returns a ContentType object representing the mutually acceptable content type, or None if no agreement could
    be reached.
    """
    # get the client requirement keys sorted with the highest q first (the server is a list which should be
    # in order of preference already).  The keys are compared by their numeric value, so that a q value which
    # has been recorded as a string is still ranked correctly against the others
    ckeys = sorted(client, key=float, reverse=True)

    # the rule for determining what to return is that "the client's preference always wins", so we look for the
    # highest q ranked item that the server is capable of returning.  We only take into account the server's
    # preference when the client has two equally weighted preferences - in that case we take the server's
    # preferred content type
    for q in ckeys:
        # find out if each possibility with this q value matches anything in the server.  We get back the
        # concrete ContentType as specified by the server if there is a match, so we know the result contains
        # no unintentional wildcards
        allowable = []
        for possibility in client[q]:
            match = self.contains_match(possibility, server)
            if match is not None:
                allowable.append(match)

        # we now know if there are 0, 1 or many allowable content types at this q value
        if not allowable:
            # we didn't find anything, so keep looking at the next q value
            continue
        if len(allowable) == 1:
            # we found exactly one match, so this is our content type to use
            return allowable[0]

        # we found multiple supported content types at this q value, so now we need to choose the server's
        # preference: the first server content type which is in the allowable list is the highest ranked one
        for supported in server:
            if supported in allowable:
                return supported

    # we've got to here without returning anything, which means that the client and server can't come to
    # an agreement on what content type they want and can deliver.  There's nothing more we can do!
    return None
+ Returns either the preferred ContentType as per the settings of the object, or None if no agreement could be + reached + """ + ssslog.debug("Fallback parameters are Accept: " + str(self.default_type) + "/" + str(self.default_subtype) + + ";" + str(self.default_params) + " and Accept-Packaging: " + str(self.default_packaging)) + + # get the accept header if available + accept = self.get_accept(dict) + packaging = self.get_packaging(dict) + ssslog.debug("Accept Header: " + str(accept)) + ssslog.debug("Packaging: "+ str(packaging)) + + if accept is None and packaging is None: + # if it is not available just return the defaults + return ContentType(self.default_type, self.default_subtype, self.default_params, self.default_packaging) + + if packaging is None: + packaging = self.default_packaging + + if accept is None: + accept = self.default_type + "/" + self.default_subtype + if self.default_params is not None: + accept += ";" + self.default_params + + ssslog.debug("Negotiating on Accept: " + str(accept) + " and Accept-Packaging: " + str(packaging)) + + # get us back a dictionary keyed by q value which tells us the order of preference that the client has + # requested + analysed = self.analyse_accept(accept, packaging) +... [truncated message content] |