From: <ku...@us...> - 2009-02-04 15:18:11
|
Revision: 323 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=323&view=rev Author: kurtjx Date: 2009-02-04 15:18:08 +0000 (Wed, 04 Feb 2009) Log Message: ----------- splits big queries down now Modified Paths: -------------- graphRDF/branches/old2sparul/old2sparul.py Modified: graphRDF/branches/old2sparul/old2sparul.py =================================================================== --- graphRDF/branches/old2sparul/old2sparul.py 2009-02-03 20:55:00 UTC (rev 322) +++ graphRDF/branches/old2sparul/old2sparul.py 2009-02-04 15:18:08 UTC (rev 323) @@ -22,22 +22,27 @@ take old myrdfspace files and add to the sparql endpoint... -b --base <uri base from myrdfspace> ''' + failedList = [] badQueryList = [] -defaultGraph = "http://dbtune.org/myspace-test" +defaultGraph = "http://dbtune.org/myspace-fj-2008p" sparqlEndPoint = "http://dbtune.org/cmn/sparql" myspaceBase = "http://dbtune.org/myspace/uid" myspaceOnt = "http://purl.org/ontology/myspace" prefixes = """PREFIX owl: <http://www.w3.org/2002/07/owl#> \nPREFIX foaf: <http://xmlns.com/foaf/0.1/> \nPREFIX dc: <http://purl.org/dc/elements/1.1/> \nPREFIX mo: <http://purl.org/ontology/mo/>\nPREFIX myspace: <http://purl.org/ontology/myspace#>\nPREFIX xsd: <http://www.w3.org/2001/XMLSchema#>""" +insert = """ \ninsert into graph <"""+defaultGraph+"""> {""" + +apacheLimit = 2000 + class Usage(Exception): def __init__(self, msg): self.msg = msg def parseRDF(filename, base): '''parse the rdf and return a sparql update query''' - sparqlU = prefixes+""" \ninsert into graph <"""+defaultGraph+"""> {""" + sparqlU='' mi = mopy.importRDFFile(base+filename) keys = mi.PersonIdx.keys() for key in keys: @@ -99,8 +104,7 @@ try: debug('attempting sparql update, try #' + str(attempt)) sparql.setReturnFormat(SPARQLWrapper.TURTLE) - ret = sparql.query() - print ret.convert() + ret = sparql.query().convert() except urllib2.HTTPError: debug('caught an http error, retrying...') if attempt<5: @@ -113,17 +117,36 @@ except SPARQLWrapper.sparqlexceptions.QueryBadFormed: error("query failed for "+ str(f)) debug('$$$$$$$$$$$$$$$$BADLY FORMED QUERY$$$$$$$$$$$$$$$$$$$') + print sparql.queryString badQueryList.append(f) failedList.append(f) except: error("query failed for "+ str(f)) debug('************UPDATE FAILED***********') failedList.append(f) - error("Unexpected error:", sys.exc_info()[0]) + print "Unexpected error:", sys.exc_info()[0] + print sparql.queryString + else: + print ret + return ret + return None def splitQuery(query): '''sometime the query is too long and should be broke in two pieces''' - pass + lines = query.splitlines(1) + splits = [] + split = "" + count = 0 + for line in lines: + if count < apacheLimit: + split = split+line + count+=len(line) + else: + splits.append(insert+split+'}') + split= line + count = 0 + splits.append(insert+split) + return splits def main(argv=None): if argv is None: @@ -145,6 +168,10 @@ output = value if option in ("-b", "--base"): base = value + '''if option in ("-g", '--graph'): + defaultGraph = value + insert = """ \ninsert into graph <"""+defaultGraph+"""> {"""''' + setLogger() if base == None: @@ -153,50 +180,27 @@ # parse base uri folder = base.split("http://myrdfspace.com/")[1] debug('getting list of files') - #fileList = getFileListing(folder) + fileList = getFileListing(folder) debug('got list of files') - fileList = ['238729309.rdf', '13280592.rdf', '26412401.rdf', '8557307.rdf', '176635064.rdf', '12656647.rdf'] + #fileList = ['238729309.rdf', '13280592.rdf', '26412401.rdf', '8557307.rdf', '176635064.rdf', '12656647.rdf'] for f in fileList: debug('parsing on file: '+str(f)) #parse each file and do a sparql update to the repository sparul = parseRDF(f, base) sparql = SPARQLWrapper.SPARQLWrapper(sparqlEndPoint) sparql.addDefaultGraph(defaultGraph) - sparql.setQuery(sparul) - trySparql(sparql, 0, f) - '''try: - debug('attempting sparql update') - sparql.setReturnFormat(SPARQLWrapper.TURTLE) - ret = sparql.query() - print ret.convert() - except urllib2.HTTPError: - debug('caught an http error, retrying...') - try: - ret = sparql.query() - print ret.convert() - except urllib2.HTTPError: - debug('second http error...') - try: - ret = sparql.query() - print ret.convert() - except: - print "query failed for "+ str(f) - debug('************UPDATE FAILED***********') - failedList.append(f) - print "FINAL error:", sys.exc_info()[0] - except: - print "query failed for "+ str(f) - debug('************UPDATE FAILED***********') - failedList.append(f) - print "Unexpected error:", sys.exc_info()[0] - except SPARQLWrapper.sparqlexceptions.QueryBadFormed: - debug('$$$$$$$$$$$$$$$$BADLY FORMED QUERY$$$$$$$$$$$$$$$$$$$') - badQueryList.append(f) - except: - print "query failed for "+ str(f) - debug('************UPDATE FAILED***********') - failedList.append(f) - print "Unexpected error:", sys.exc_info()[0]''' + + # we have to deal w/ queries that are too long + if len(sparul) > apacheLimit: + debug('query too long, splitting...') + splitSparul = splitQuery(sparul) + for split in splitSparul: + sparql.setQuery(prefixes+split) + trySparql(sparql, 0, f) + else: + sparql.setQuery(prefixes+insert+sparul) + trySparql(sparql, 0, f) + debug("Complete!!!") This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |