From: <ku...@us...> - 2009-02-03 20:55:05
|
#!/usr/bin/env python
# encoding: utf-8
#
# Provenance (SourceForge commit mail this file was recovered from):
#   Revision: 322 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=322&view=rev
#   Author:   kurtjx
#   Date:     2009-02-03 20:55:00 +0000 (Tue, 03 Feb 2009)
#   Log:      lil python script for importing old myrdfspace data into 3 store,
#             need to add a function to break long queries in two
#   Added:    graphRDF/branches/old2sparul/old2sparul.py (rev 0 -> rev 322)
"""
old2sparul.py

Read old myrdfspace RDF files, turn each one into a SPARQL/Update
"insert into graph" query and send it to the configured SPARQL endpoint.

This is a Python 2 script (it relies on urllib2); the syntax is kept
parseable by Python 2.6+ and Python 3.

Created by Kurtis Random on 2009-02-03.
Copyright (c) 2009 __MyCompanyName__. All rights reserved.
"""

import sys
import getopt
from logging import log, error, warning, info, debug
import logging
import ftplib
#from SPARQLWrapper import SPARQLWrapper
import SPARQLWrapper
import mopy
import urllib2
from time import sleep

help_message = '''
take old myrdfspace files and add to the sparql endpoint... 
    -b --base <uri base from myrdfspace>
'''

# files whose update could not be built or sent / files whose query was
# rejected as badly formed (those are listed in failedList as well)
failedList = []
badQueryList = []

defaultGraph = "http://dbtune.org/myspace-test"
sparqlEndPoint = "http://dbtune.org/cmn/sparql"
myspaceBase = "http://dbtune.org/myspace/uid"
myspaceOnt = "http://purl.org/ontology/myspace"
prefixes = (
    "PREFIX owl: <http://www.w3.org/2002/07/owl#> \n"
    "PREFIX foaf: <http://xmlns.com/foaf/0.1/> \n"
    "PREFIX dc: <http://purl.org/dc/elements/1.1/> \n"
    "PREFIX mo: <http://purl.org/ontology/mo/>\n"
    "PREFIX myspace: <http://purl.org/ontology/myspace#>\n"
    "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>"
)

# uri prefix every --base value has to contain
myrdfspaceRoot = "http://myrdfspace.com/"

# trySparql retries a query this many times after an http error,
# sleeping retryDelay seconds between two tries
maxHttpRetries = 5
retryDelay = 2


class Usage(Exception):
    '''command line usage error; msg holds the text shown to the user'''

    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg


def _localPart(uri, base):
    '''return what follows base in uri, or None when uri is empty or does
    not contain base'''
    if not uri:
        return None
    pieces = uri.split(base)
    if len(pieces) < 2:
        return None
    return pieces[1]


def _quote(text):
    '''percent-encode text; unicode is encoded as utf-8 first because the
    python 2 quote() raises KeyError on non-ascii unicode input'''
    if not isinstance(text, str):
        text = text.encode("utf-8")
    return urllib2.quote(text)


def _drain(values):
    '''yield the items popped from values until it is empty (None yields
    nothing)'''
    while values:
        yield values.pop()


def parseRDF(filename, base):
    '''parse the rdf and return a sparql update query

    filename -- name of the rdf file, appended to base to get its url
    base     -- uri base of the myrdfspace folder; also stripped from the
                uris found in the file to get the myspace ids

    Returns the text of one "insert into graph" query for defaultGraph.
    Entries whose uri does not contain base are skipped with a warning.
    Whatever mopy.importRDFFile raises is passed on to the caller.
    '''
    statements = [prefixes + " \ninsert into graph <" + defaultGraph + "> {"]
    mi = mopy.importRDFFile(base + filename)
    for key in mi.PersonIdx.keys():
        person = mi.PersonIdx[key]
        if not person.name:
            # if we find the name, this is the main subject; others are skipped
            continue
        suid = _localPart(person.URI, base)
        if suid is None:
            warning("skipping person outside of base: %s", person.URI)
            continue
        subject = "<" + myspaceBase + "/" + suid + ">"
        name = person.name.pop()
        statements.append(subject + ' foaf:name "' + _quote(name) + '"@en . ')

        # get all the top friends
        for friend in _drain(getattr(person, "knows", None)):
            ouid = _localPart(getattr(friend, "URI", None), base)
            if ouid is None:
                warning("skipping friend outside of base in %s", filename)
                continue
            obj = "<" + myspaceBase + "/" + ouid + ">"
            statements.append(subject + " foaf:knows " + obj + " . ")
            statements.append(subject + " myspace:topFriend " + obj + " . ")

        # genre tags are stored as foaf:theme uris below base
        for thm in _drain(getattr(person, "theme", None)):
            tag = _localPart(getattr(thm, "URI", None), base)
            if tag is None:
                warning("skipping genre outside of base in %s", filename)
                continue
            genre = "<" + myspaceOnt + "#" + _quote(tag) + ">"
            statements.append(subject + " myspace:genreTag " + genre + " . ")

        # the play count is stored as a single foaf:tipjar uri below base
        tipjar = getattr(person, "tipjar", None)
        if tipjar:
            playcount = _localPart(getattr(tipjar.pop(), "URI", None), base)
            if playcount is not None:
                statements.append(subject + ' myspace:totalPlays "'
                                  + playcount + '"^^xsd:int . ')

    return "\n".join(statements) + "}"


def setLogger():
    '''just set the logger (debug level, timestamped lines)'''
    loggingConfig = {"format": '%(asctime)s %(levelname)-8s %(message)s',
                     "datefmt": '%d.%m.%y %H:%M:%S',
                     "level": logging.DEBUG,
                     #"filename":logPath + "musicGrabber.log",
                     # filemode only matters once a filename is configured
                     "filemode": "w"}
    logging.basicConfig(**loggingConfig)


def getFileListing(rdfFolder):
    '''return a list of all the rdf files found w/ given base

    rdfFolder -- folder below myrdfspace.com/ on the ftp server; a trailing
                 slash is optional
    '''
    rdfFolder = rdfFolder.rstrip('/') + '/'
    ftp = ftplib.FTP("myrdfspace.com")
    try:
        # NOTE(review): credentials are hard-coded in the source; consider
        # moving them to configuration.
        ftp.login("myrdf", "my1stRDF")
        ftp.cwd("myrdfspace.com/" + rdfFolder)
        return ftp.nlst()
    finally:
        # always release the connection, also when login/cwd/nlst fail
        ftp.close()


def trySparql(sparql, attempt, f):
    '''run the update held by sparql, retrying after http errors

    sparql  -- SPARQLWrapper object with endpoint and query already set
    attempt -- number of the first try (callers pass 0); after an http
               error the query is retried until the number reaches
               maxHttpRetries
    f       -- file the query was built from, used for the report lists

    Nothing is returned and nothing is raised for ordinary errors: a
    failure is logged and f is appended to failedList (and to badQueryList
    when the endpoint rejects the query as badly formed).
    '''
    while True:
        try:
            debug('attempting sparql update, try #' + str(attempt))
            sparql.setReturnFormat(SPARQLWrapper.TURTLE)
            ret = sparql.query()
            print(ret.convert())
            return
        except urllib2.HTTPError:
            debug('caught an http error, retrying...')
            if attempt >= maxHttpRetries:
                error("more than %d http errors, giving up", maxHttpRetries)
                failedList.append(f)
                return
            attempt += 1
            sleep(retryDelay)
        except SPARQLWrapper.sparqlexceptions.QueryBadFormed:
            error("query failed for %s", f)
            debug('$$$$$$$$$$$$$$$$BADLY FORMED QUERY$$$$$$$$$$$$$$$$$$$')
            badQueryList.append(f)
            failedList.append(f)
            return
        except Exception:
            # Exception rather than a bare except, so that ctrl-c and
            # sys.exit still stop the script
            error("query failed for %s", f)
            debug('************UPDATE FAILED***********')
            failedList.append(f)
            error("Unexpected error: %s", sys.exc_info()[0])
            return


def splitQuery(query):
    '''sometime the query is too long and should be broke in two pieces

    query -- an "insert into graph <g> { ... }" query with one triple per
             line, as built by parseRDF

    Returns a list of queries: two queries that share the text in front of
    the "{" and carry one half of the triple lines each, or [query]
    unchanged when it has fewer than two triple lines or does not look
    like such a query.
    '''
    head, opening, rest = query.partition("{")
    body, closing, tail = rest.rpartition("}")
    if not opening or not closing or tail.strip():
        return [query]
    triples = [line for line in body.split("\n") if line.strip()]
    if len(triples) < 2:
        return [query]
    middle = len(triples) // 2
    return [head + "{\n" + "\n".join(part) + "}"
            for part in (triples[:middle], triples[middle:])]


def main(argv=None):
    '''command line entry point

    argv -- argument list including the program name; sys.argv when None

    Returns 2 after a usage error and None otherwise; a report of the
    failed files is printed at the end of a run.
    '''
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "ho:b:v",
                                       ["help", "output=", "base="])
        except getopt.error as msg:
            raise Usage(msg)

        # option processing; -v and -o/--output are still accepted but
        # nothing uses them
        base = None
        for option, value in opts:
            if option in ("-h", "--help"):
                raise Usage(help_message)
            if option in ("-b", "--base"):
                base = value

        setLogger()
        if base is None:
            raise Usage(help_message)

        # parse base uri
        pieces = base.split(myrdfspaceRoot)
        if len(pieces) < 2:
            raise Usage("base has to be a uri below " + myrdfspaceRoot)
        folder = pieces[1]

        debug('getting list of files')
        # TODO: the ftp listing is switched off, a fixed test list is used
        #fileList = getFileListing(folder)
        debug('got list of files')
        fileList = ['238729309.rdf', '13280592.rdf', '26412401.rdf',
                    '8557307.rdf', '176635064.rdf', '12656647.rdf']
        for f in fileList:
            debug('parsing on file: ' + str(f))
            #parse each file and do a sparql update to the repository
            try:
                sparul = parseRDF(f, base)
            except Exception:
                # one unreadable file must not abort the whole import
                logging.exception("could not build the update for %s", f)
                failedList.append(f)
                continue
            sparql = SPARQLWrapper.SPARQLWrapper(sparqlEndPoint)
            sparql.addDefaultGraph(defaultGraph)
            sparql.setQuery(sparul)
            trySparql(sparql, 0, f)

        debug("Complete!!!")
        print("\n\nREPORT:\n\tfailures: " + str(len(failedList)))
        print("\nfails: ")
        print(failedList)
        print("\n\nbad queries: ")
        print(badQueryList)

    except Usage as err:
        # name the program after the argv that was actually parsed
        prog = (argv[0] if argv else sys.argv[0]).split("/")[-1]
        sys.stderr.write(prog + ": " + str(err.msg) + "\n")
        sys.stderr.write("\t for help use --help\n")
        return 2


if __name__ == "__main__":
    sys.exit(main())

# This was sent by the SourceForge.net collaborative development platform,
# the world's largest Open Source development site.