|
From: <ku...@us...> - 2009-02-03 20:55:05
|
Revision: 322
http://mypyspace.svn.sourceforge.net/mypyspace/?rev=322&view=rev
Author: kurtjx
Date: 2009-02-03 20:55:00 +0000 (Tue, 03 Feb 2009)
Log Message:
-----------
lil python script for importing old myrdfspace data into 3 store, need to add a function to break long queries in two
Added Paths:
-----------
graphRDF/branches/old2sparul/old2sparul.py
Added: graphRDF/branches/old2sparul/old2sparul.py
===================================================================
--- graphRDF/branches/old2sparul/old2sparul.py (rev 0)
+++ graphRDF/branches/old2sparul/old2sparul.py 2009-02-03 20:55:00 UTC (rev 322)
@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+old2sparul.py
+
+Created by Kurtis Random on 2009-02-03.
+Copyright (c) 2009 __MyCompanyName__. All rights reserved.
+"""
+
+import sys
+import getopt
+from logging import log, error, warning, info, debug
+import logging
+import ftplib
+#from SPARQLWrapper import SPARQLWrapper
+import SPARQLWrapper
+import mopy
+import urllib2
+from time import sleep
+
+# text shown for -h/--help and whenever the required --base option is missing
+help_message = '''
+take old myrdfspace files and add to the sparql endpoint...
+ -b --base <uri base from myrdfspace>
+'''
+
+# files whose update could not be completed / files whose query was rejected as badly formed
+badQueryList = []
+failedList = []
+
+# target named graph and endpoint for the sparql updates
+sparqlEndPoint = "http://dbtune.org/cmn/sparql"
+defaultGraph = "http://dbtune.org/myspace-test"
+
+# uri roots used when minting subject and genre uris
+myspaceOnt = "http://purl.org/ontology/myspace"
+myspaceBase = "http://dbtune.org/myspace/uid"
+
+# namespace declarations prepended to every generated query
+prefixes = (
+    "PREFIX owl: <http://www.w3.org/2002/07/owl#> \n"
+    "PREFIX foaf: <http://xmlns.com/foaf/0.1/> \n"
+    "PREFIX dc: <http://purl.org/dc/elements/1.1/> \n"
+    "PREFIX mo: <http://purl.org/ontology/mo/>\n"
+    "PREFIX myspace: <http://purl.org/ontology/myspace#>\n"
+    "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>"
+)
+
+class Usage(Exception):
+    '''raised for command line usage problems; msg holds the text shown to the user'''
+    def __init__(self, msg):
+        # hand msg to Exception as well so str(err) and err.args are not empty
+        Exception.__init__(self, msg)
+        self.msg = msg
+
+def _localId(resource, base):
+    '''return the part of resource.URI that follows base, or None if there is no such part'''
+    try:
+        return resource.URI.split(base)[1]
+    except (AttributeError, IndexError):
+        # no usable URI, or the URI does not contain base
+        return None
+
+def _quote(text):
+    '''percent-encode text; unicode is utf-8 encoded first since quote() raises KeyError on non-ascii unicode'''
+    if isinstance(text, unicode):
+        text = text.encode('utf-8')
+    return urllib2.quote(text)
+
+def _drain(person, attr):
+    '''pop and yield every value of person.<attr>; yields nothing if the attribute is absent or empty'''
+    values = getattr(person, attr, None)
+    while values:
+        yield values.pop()
+
+def parseRDF(filename, base):
+    '''parse the rdf at base+filename and return a sparql update query
+
+    filename -- name of the rdf file, appended to base to form the location that is parsed
+    base     -- old uri base; it is stripped from every uri to recover the local id
+
+    Only persons that have a name are treated as a subject. For each one the
+    query inserts the name, a foaf:knows and a myspace:topFriend triple per
+    known person, a myspace:genreTag per theme and, when a tipjar value is
+    present, a myspace:totalPlays count. Resources whose uri does not contain
+    base are skipped. The result is a single "insert into graph" statement
+    against defaultGraph.
+    '''
+    # statements are collected and joined once at the end
+    lines = [prefixes+" \ninsert into graph <"+defaultGraph+"> {"]
+    mi = mopy.importRDFFile(base+filename)
+    for key in mi.PersonIdx.keys():
+        person = mi.PersonIdx[key]
+        if not person.name:
+            continue
+
+        # a person with a name is the main subject
+        suid = _localId(person, base)
+        if suid is None:
+            warning("skipping person with uri outside base: " + str(key))
+            continue
+        subject = "<"+myspaceBase+"/"+suid+">"
+        name = person.name.pop()
+        # NOTE(review): the name is stored percent-encoded, as before -- confirm this is wanted
+        lines.append(subject+' foaf:name "'+_quote(name)+'"@en . ')
+
+        # get all the top friends
+        for friend in _drain(person, 'knows'):
+            ouid = _localId(friend, base)
+            if ouid is None:
+                continue
+            obj = "<"+myspaceBase+"/"+ouid+">"
+            lines.append(subject+" foaf:knows "+obj+' . ')
+            lines.append(subject+" myspace:topFriend "+obj+' . ')
+
+        # every theme becomes a genre tag in the myspace ontology namespace
+        for thm in _drain(person, 'theme'):
+            tag = _localId(thm, base)
+            if tag is None:
+                continue
+            genre = "<"+myspaceOnt+"#"+_quote(tag)+">"
+            lines.append(subject+" myspace:genreTag "+genre+' . ')
+
+        # the play count is read from the local part of one tipjar uri
+        tips = getattr(person, 'tipjar', None)
+        if tips:
+            playcount = _localId(tips.pop(), base)
+            if playcount is not None:
+                lines.append(subject+' myspace:totalPlays "'+playcount+'"^^xsd:int . ')
+
+    return "\n".join(lines)+'}'
+
+def setLogger():
+    '''configure the root logger: DEBUG level with timestamped messages'''
+    # no filename is given, so basicConfig does not log to a file and
+    # filemode has no effect until one is supplied
+    logging.basicConfig(
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        datefmt='%d.%m.%y %H:%M:%S',
+        level=logging.DEBUG,
+        #filename=logPath + "musicGrabber.log",
+        filemode="w")
+
+def getFileListing(rdfFolder):
+    '''return the names of all entries in the given folder on the myrdfspace ftp server
+
+    rdfFolder -- folder path below "myrdfspace.com/" on the server; trailing
+                 slashes are ignored
+
+    The listing is returned exactly as the server reports it (no filtering
+    by extension). Errors from ftplib propagate to the caller.
+    '''
+    rdfFolder = rdfFolder.rstrip('/')+'/'
+    ftp = ftplib.FTP("myrdfspace.com")
+    try:
+        # NOTE(review): credentials are hard-coded in the source; consider moving them to configuration
+        ftp.login("myrdf", "my1stRDF")
+        ftp.cwd("myrdfspace.com/"+rdfFolder)
+        return ftp.nlst()
+    finally:
+        # always release the connection, even when login/cwd/nlst fails
+        try:
+            ftp.quit()
+        except ftplib.all_errors:
+            ftp.close()
+
+def trySparql(sparql, attempt, f):
+    '''run the prepared sparql update, retrying on http errors
+
+    sparql  -- wrapper object with the query already set
+    attempt -- number of tries already made; callers start with 0
+    f       -- name of the file being imported, used only for reporting
+
+    An http error is retried after a 2 second pause until attempt reaches 5;
+    after that f is recorded in failedList. A badly formed query is recorded
+    in both badQueryList and failedList; any other error is recorded in
+    failedList. Nothing is raised for these cases and nothing is returned.
+    '''
+    maxRetries = 5
+    while True:
+        try:
+            debug('attempting sparql update, try #' + str(attempt))
+            sparql.setReturnFormat(SPARQLWrapper.TURTLE)
+            ret = sparql.query()
+            print ret.convert()
+            return
+        except urllib2.HTTPError:
+            if attempt >= maxRetries:
+                error("more than 5 http errors, giving up")
+                failedList.append(f)
+                return
+            debug('caught an http error, retrying...')
+            attempt += 1
+            sleep(2)
+        except SPARQLWrapper.sparqlexceptions.QueryBadFormed:
+            error("query failed for "+ str(f))
+            debug('$$$$$$$$$$$$$$$$BADLY FORMED QUERY$$$$$$$$$$$$$$$$$$$')
+            badQueryList.append(f)
+            failedList.append(f)
+            return
+        except Exception:
+            # Exception rather than a bare except, so ctrl-c still stops the import
+            error("query failed for "+ str(f))
+            debug('************UPDATE FAILED***********')
+            failedList.append(f)
+            # extra logging arguments need a placeholder in the message
+            error("Unexpected error: %s", sys.exc_info()[0])
+            return
+
+def splitQuery(query):
+    '''placeholder for breaking a query that is too long into two pieces
+
+    Not implemented yet: query is ignored and None is returned.
+    '''
+    return None
+
+def main(argv=None):
+    '''command line entry point: import old myrdfspace files into the sparql endpoint
+
+    argv -- argument list including the program name; defaults to sys.argv
+
+    Each file is parsed into a sparql update, sent to sparqlEndPoint, and a
+    report of failed files and bad queries is printed at the end. Returns 2
+    after printing a message to stderr when the arguments are unusable,
+    otherwise None.
+    '''
+    if argv is None:
+        argv = sys.argv
+    try:
+        try:
+            opts, args = getopt.getopt(argv[1:], "ho:b:v", ["help", "output=","base="])
+        except getopt.error, msg:
+            raise Usage(msg)
+
+        # option processing; -v and -o/--output are accepted but have no effect
+        base = None
+        for option, value in opts:
+            if option in ("-h", "--help"):
+                raise Usage(help_message)
+            if option in ("-b", "--base"):
+                base = value
+
+        setLogger()
+        if base is None:
+            raise Usage(help_message)
+
+        # parse base uri; report a base outside myrdfspace.com as a usage error
+        baseParts = base.split("http://myrdfspace.com/")
+        if len(baseParts) < 2:
+            raise Usage("base must be a uri under http://myrdfspace.com/")
+        folder = baseParts[1]
+
+        debug('getting list of files')
+        # TODO: the ftp listing is disabled; a fixed list of files is imported instead
+        #fileList = getFileListing(folder)
+        debug('got list of files')
+        fileList = ['238729309.rdf', '13280592.rdf', '26412401.rdf', '8557307.rdf', '176635064.rdf', '12656647.rdf']
+        for f in fileList:
+            debug('parsing on file: '+str(f))
+            # parse each file and do a sparql update to the repository
+            sparul = parseRDF(f, base)
+            sparql = SPARQLWrapper.SPARQLWrapper(sparqlEndPoint)
+            sparql.addDefaultGraph(defaultGraph)
+            sparql.setQuery(sparul)
+            trySparql(sparql, 0, f)
+
+        debug("Complete!!!")
+        print "\n\nREPORT:\n\tfailures: "+str(len(failedList))
+        print "\nfails: "
+        print failedList
+        print "\n\nbad queries: "
+        print badQueryList
+
+    except Usage, err:
+        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
+        print >> sys.stderr, "\t for help use --help"
+        return 2
+
+
+# run the importer when executed as a script and use its result as the exit status
+if __name__ == "__main__":
+    exitStatus = main()
+    sys.exit(exitStatus)
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|