From: <ku...@us...> - 2009-06-22 14:08:14
Revision: 354
          http://mypyspace.svn.sourceforge.net/mypyspace/?rev=354&view=rev
Author:   kurtjx
Date:     2009-06-22 14:08:10 +0000 (Mon, 22 Jun 2009)

Log Message:
-----------
some fixing of the old graphs - adding data to sparql point

Added Paths:
-----------
    graphRDF/branches/old2sparul/src/
    graphRDF/branches/old2sparul/src/addTotalFriends.py
    graphRDF/branches/old2sparul/src/old2sparul.py

Removed Paths:
-------------
    graphRDF/branches/old2sparul/old2sparul.py

Deleted: graphRDF/branches/old2sparul/old2sparul.py
===================================================================
--- graphRDF/branches/old2sparul/old2sparul.py	2009-05-27 21:19:59 UTC (rev 353)
+++ graphRDF/branches/old2sparul/old2sparul.py	2009-06-22 14:08:10 UTC (rev 354)
@@ -1,264 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-"""
-old2sparul.py
-
-This is an ad hoc script for taking data from myrdfspace.com, cleaning it, and putting in sparql endpoint
-
-Created by Kurtis Random on 2009-02-03.
-Copyright (c) 2009 C4DM QMUL. All rights reserved.
-"""
-
-import sys
-import getopt
-from logging import log, error, warning, info, debug
-import logging
-import ftplib
-import SPARQLWrapper
-import mopy
-import urllib2
-import re
-from time import sleep
-
-help_message = '''
-take old myrdfspace files and add to the sparql endpoint...
- -b --base <uri base from myrdfspace>
- -s --start <uid to start from> useful after a crash ;-)
-'''
-
-failedList = []
-badQueryList = []
-
-defaultGraph = "http://dbtune.org/myspace-fj-2008"
-sparqlEndPoint = "http://dbtune.org/cmn/sparql"
-myspaceBase = "http://dbtune.org/myspace/uid"
-myspaceOnt = "http://purl.org/ontology/myspace"
-prefixes = """PREFIX owl: <http://www.w3.org/2002/07/owl#> \nPREFIX foaf: <http://xmlns.com/foaf/0.1/> \nPREFIX dc: <http://purl.org/dc/elements/1.1/> \nPREFIX mo: <http://purl.org/ontology/mo/>\nPREFIX myspace: <http://purl.org/ontology/myspace#>\nPREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\nPREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>"""
-
-insert = """ \ninsert into graph <"""+defaultGraph+"""> {"""
-
-apacheLimit = 2000
-
-class Usage(Exception):
-    def __init__(self, msg):
-        self.msg = msg
-
-def tryImportRDF(filename, attempt):
-    if attempt < 5:
-        debug("importing rdf")
-        try:
-            mi = mopy.importRDFFile(filename)
-        except urllib2.URLError:
-            debug("URLError importing RDF, retrying")
-            sleep(1.0)
-            attempt+=1
-            tryImportRDF(filename, attempt)
-        else:
-            return mi
-    debug("import failed after tries: " + str(attempt))
-    return None
-
-def parseRDF(filename, base):
-    '''parse the rdf and return a sparql update query'''
-    sparqlU=''
-    mi = tryImportRDF(base+filename, 0)
-    if mi:
-        keys = mi.PersonIdx.keys()
-        for key in keys:
-            person = mi.PersonIdx[key]
-            if person.name:
-                # if we find the name, this is the main subject
-                suid = person.URI.split(base)[1]
-                subject = "<"+myspaceBase+"/"+suid+">"
-                name = person.name.pop()
-                sparqlU = sparqlU + '\n'+subject+' rdf:type mo:MusicArtist .'
-                sparqlU = sparqlU + '\n'+subject+' myspace:myspaceID "'+filename.rstrip('.rdf')+'"^^xsd:int .'
-                sparqlU = sparqlU + """\n"""+subject+' foaf:name "' + urllib2.quote(name)+'"@en . '
-
-                # get all the top friends
-                while(1):
-                    try:
-                        p = person.knows.pop()
-                    except:
-                        break
-                    else:
-                        ouid = p.URI.split(base)[1]
-                        obj = "<"+myspaceBase+"/"+ouid+">"
-                        sparqlU=sparqlU+ "\n"+subject+" foaf:knows "+ obj+ ' . ' "\n"+subject+" myspace:topFriend "+obj+ ' . '
-                        sparqlU = sparqlU + '\n'+obj+' rdf:type mo:MusicArtist .'
-
-                while(1):
-                    try:
-                        thm = person.theme.pop()
-                    except:
-                        debug("breaking from genre pops")
-                        break
-                    else:
-                        thm = thm.URI.split(base)[1]
-                        # do some cleaning, bad genres in there like 35123543.rdf instead of hip hop
-                        if not re.match(".*\.rdf",thm):
-                            debug("adding genre: "+thm)
-                            genre = "<"+myspaceOnt + "#"+urllib2.quote(thm)+">"
-                            sparqlU=sparqlU+ "\n"+subject+ " myspace:genreTag "+ genre+ ' . '
-
-                try:
-                    playcount = person.tipjar.pop().URI.split(base)[1]
-                    sparqlU=sparqlU+ "\n"+subject+ ' myspace:totalPlays "'+ playcount+'"^^xsd:int . '
-                except:
-                    pass
-
-        sparqlU=sparqlU+'}'
-        return sparqlU
-    else:
-        return None
-
-def setLogger():
-    '''just set the logger'''
-    loggingConfig = {"format":'%(asctime)s %(levelname)-8s %(message)s',
-                     "datefmt":'%d.%m.%y %H:%M:%S',
-                     "level": logging.DEBUG,
-                     #"filename":logPath + "musicGrabber.log",
-                     "filemode":"w"}
-    logging.basicConfig(**loggingConfig)
-
-def getFileListing(rdfFolder):
-    '''return a list of all the rdf files found w/ given base'''
-    rdfFolder = rdfFolder.rstrip('/')
-    rdfFolder = rdfFolder+'/'
-    ftp = ftplib.FTP("myrdfspace.com")
-    ftp.login("myrdf", "my1stRDF")
-    ftp.cwd("myrdfspace.com/"+rdfFolder)
-    vList = ftp.nlst()
-    return vList
-
-def trySparql(sparql, attempt, f):
-    try:
-        debug('attempting sparql update, try #' + str(attempt))
-        sparql.setReturnFormat(SPARQLWrapper.TURTLE)
-        ret = sparql.query().convert()
-    except urllib2.HTTPError:
-        debug('caught an http error, retrying...')
-        if attempt<5:
-            attempt+=1
-            sleep(2)
-            trySparql(sparql, attempt, f)
-        else:
-            error("more that 5 http errors, giving up")
-            failedList.append(f)
-    except SPARQLWrapper.sparqlexceptions.QueryBadFormed:
-        error("query failed for "+ str(f))
-        debug('$$$$$$$$$$$$$$$$BADLY FORMED QUERY$$$$$$$$$$$$$$$$$$$')
-        print sparql.queryString
-        badQueryList.append(f)
-        failedList.append(f)
-    except:
-        error("query failed for "+ str(f))
-        debug('************UPDATE FAILED***********')
-        failedList.append(f)
-        print "Unexpected error:", sys.exc_info()[0]
-        print sparql.queryString
-    else:
-        print ret
-        return ret
-    return None
-
-def splitQuery(query):
-    '''sometime the query is too long and should be broke in two pieces'''
-    lines = query.splitlines(1)
-    splits = []
-    split = ""
-    count = 0
-    for line in lines:
-        if count < apacheLimit:
-            split = split+line
-            count+=len(line)
-        else:
-            splits.append(insert+split+'}')
-            split= line
-            count = 0
-    splits.append(insert+split)
-    return splits
-
-def main(argv=None):
-    if argv is None:
-        argv = sys.argv
-    try:
-        try:
-            opts, args = getopt.getopt(argv[1:], "ho:b:s:v", ["help", "output=","base=", "start="])
-        except getopt.error, msg:
-            raise Usage(msg)
-
-        # option processing
-        base = None
-        start = None
-        for option, value in opts:
-            if option == "-v":
-                verbose = True
-            if option in ("-h", "--help"):
-                raise Usage(help_message)
-            if option in ("-o", "--output"):
-                output = value
-            if option in ("-b", "--base"):
-                base = value
-            if option in ("-s", "--start"):
-                start = value
-            '''if option in ("-g", '--graph'):
-                defaultGraph = value
-                insert = """ \ninsert into graph <"""+defaultGraph+"""> {"""'''
-
-
-        setLogger()
-        if base == None:
-            raise Usage(help_message)
-            return 2
-        # parse base uri
-        folder = base.split("http://myrdfspace.com/")[1]
-        debug('getting list of files')
-        fileList = getFileListing(folder)
-        debug('got list of files')
-        #fileList = ['238729309.rdf', '13280592.rdf', '26412401.rdf', '8557307.rdf', '176635064.rdf', '12656647.rdf']
-        startIndex=0
-        if start:
-            try:
-                startIndex=fileList.index(start)
-            except:
-                debug("not a valid start file, not in list")
-
-        for f in fileList[startIndex:]:
-            debug('parsing on file: '+str(f))
-            #parse each file and do a sparql update to the repository
-            sparul = parseRDF(f, base)
-            sparql = SPARQLWrapper.SPARQLWrapper(sparqlEndPoint)
-            sparql.addDefaultGraph(defaultGraph)
-            if sparul:
-                # we have to deal w/ queries that are too long
-                if len(sparul) > apacheLimit:
-                    debug('query too long, splitting...')
-                    splitSparul = splitQuery(sparul)
-                    for split in splitSparul:
-                        sparql.setQuery(prefixes+split)
-                        trySparql(sparql, 0, f)
-                else:
-                    sparql.setQuery(prefixes+insert+sparul)
-                    trySparql(sparql, 0, f)
-            else:
-                debug('failure on '+str(f))
-                failedList.append(f)
-
-
-
-        debug("Complete!!!")
-        print "\n\nREPORT:\n\tfailures: "+str(len(failedList))
-        print "\nfails: "
-        print failedList
-        print "\n\nbad queries: "
-        print badQueryList
-
-    except Usage, err:
-        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
-        print >> sys.stderr, "\t for help use --help"
-        return 2
-
-
-if __name__ == "__main__":
-    sys.exit(main())

Added: graphRDF/branches/old2sparul/src/addTotalFriends.py
===================================================================
--- graphRDF/branches/old2sparul/src/addTotalFriends.py	(rev 0)
+++ graphRDF/branches/old2sparul/src/addTotalFriends.py	2009-06-22 14:08:10 UTC (rev 354)
@@ -0,0 +1,160 @@
+#!/usr/bin/python
+'''
+Created on Jun 19, 2009
+
+@author: kurtjx
+'''
+import SPARQLWrapper
+from rdflib import ConjunctiveGraph, URIRef, Literal, Namespace
+import logging
+from logging import debug, error, info
+from time import sleep
+import sys
+import urllib2
+
+
+DEFAULT_GRAPH = "http://dbtune.org/myspace-fj-2008"
+ENDPOINT = "http://virtuoso.dbtune.org/sparql"
+
+MYSPACE = Namespace("http://purl.org/ontology/myspace#")
+
+totfri_fail_list = []
+country_fail_list = []
+local_fail_list = []
+
+def get_some_artists(sparql, limit=500, offset=0):
+    debug('querying for artists with limit %s and offset %s' % (str(limit), str(offset)) )
+    q = 'define sql:log-enable 2 SELECT DISTINCT ?artist FROM <%s> WHERE { ?artist a <http://purl.org/ontology/mo/MusicArtist> } LIMIT %s OFFSET %s' % (DEFAULT_GRAPH, str(limit), str(offset))
+    sparql.setQuery(q)
+    results = try_sparql(sparql, 0, 5)
+    if results == None:
+        error('freaking out, no results in last query: %s' % q)
+        print fail_list
+        sys.exit(2)
+    else:
+        debug('creating local graph and parsing results...')
+        graph = ConjunctiveGraph()
+        for result in results['results']['bindings']:
+            uri = result['artist']['value']
+            #debug('creating new cursor')
+            #cursor = CONNECT.cursor()
+            #print('inserting triples for %s' % uri)
+            debug('getting total friends for %s' % uri)
+            results = get_total_friends(uri)
+            #q = 'SPARQL define sql:log-enable 2 INSERT IN GRAPH <'+DEFAULT_GRAPH+'> { <'+uri+'> <http://purl.org/ontology/myspace#totalFriends> "'+totfri+'"^^xsd:int } '
+            #q = "DB.DBA.TTLP_MT('<%s> <http://purl.org/ontology/myspace#totalFriends> %s . ', '', '%s') " % (uri, totfri, DEFAULT_GRAPH)
+            #print q
+            #cursor.execute(q)
+            #cursor.close()
+            #debug('cursor closed')
+            if results['totalFriends'] != None:
+                graph.add((URIRef(uri), MYSPACE['totalFriends'], Literal(int(results['totalFriends']))))
+            else:
+                debug('!!!!!!!!!!!!!!!!!!!!!!! no friends for %s !!!!!!!!!!!!!!!!!!!!!!' % uri)
+                totfri_fail_list.append(uri+'\n')
+
+            if results['locality'] != None:
+                graph.add((URIRef(uri), MYSPACE['locality'], Literal(results['locality'])))
+            else:
+                debug('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ no locality for %s @@@@@@@@@@@@@@@@ ' % uri)
+                local_fail_list.append(uri+'\n')
+
+            if results['country'] != None:
+                graph.add((URIRef(uri), MYSPACE['country'], Literal(results['country'])))
+            else:
+                debug('******************************** no country for %s ***************************' % uri)
+                country_fail_list.append(uri+'\n')
+
+
+        debug('serializing results')
+        graph.serialize('./rdf/'+str(offset)+'.rdf')
+        f = open('./log/'+str(offset)+'_totfri_errors', 'w')
+        f.writelines(totfri_fail_list)
+        f.close()
+        f = open('./log/'+str(offset)+'_locality_errors', 'w')
+        f.writelines(local_fail_list)
+        f.close()
+        f = open('./log/'+str(offset)+'_country_errors', 'w')
+        f.writelines(country_fail_list)
+        f.close()
+
+        debug('done with result set')
+        offset += limit
+        get_some_artists(sparql, limit, offset)
+
+
+def get_total_friends(uri, attempt=0, fail=5):
+    '''get the total friends from the give uri'''
+    graph = ConjunctiveGraph()
+    results = {}
+    try:
+        graph.parse(uri)
+    except urllib2.HTTPError:
+        if attempt<fail:
+            attempt+=1
+            sleep(2)
+            get_total_friends(uri, attempt, fail)
+        else:
+            return None
+    for row in graph.query('select ?totfri where { ?x <http://purl.org/ontology/myspace#totalFriends> ?totfri . } ' ):
+        totfri = row[0]
+        try:
+            totfri = totfri.strip('http://dbtune.org/myspace/uid/')
+        except:
+            results['totalFriends'] = None # exception means we didn't find any friends
+        else:
+            results['totalFriends'] = totfri
+    local = None
+    for row in graph.query('select ?local where { ?x <http://purl.org/ontology/myspace#locality> ?local . } '):
+        local = row[0]
+    results['locality'] = local
+    country = None
+    for row in graph.query('select ?country where { ?x <http://purl.org/ontology/myspace#country> ?country . } '):
+        country = row[0]
+    results['country'] = country
+    return results
+
+
+def insert_total_friends(uri, cursor):
+    totfri = get_total_friends(uri)
+    q = 'SPARQL define sql:log-enable 2 INSERT IN GRAPH <%s> { <%s> <http://purl.org/ontology/myspace#totalFriends> "%s"^^xsd:int } ' % (DEFAULT_GRAPH, uri, totfri)
+    #print q
+    cursor.execute(q)
+
+def try_sparql(sparql, attempt=0, fail=5):
+    try:
+        debug('attempting sparql query, try #' + str(attempt))
+        sparql.setReturnFormat(SPARQLWrapper.JSON)
+        ret = sparql.query().convert()
+    except urllib2.HTTPError:
+        debug('caught an http error, retrying...')
+        if attempt<fail:
+            attempt+=1
+            sleep(2)
+            trySparql(sparql, attempt, fail)
+        else:
+            error("more that 5 http errors, giving up")
+            return None
+    return ret
+
+def set_logger(level = logging.DEBUG):
+    '''just set the logger'''
+    loggingConfig = {"format":'%(asctime)s %(levelname)-8s %(message)s',
+                     "datefmt":'%d.%m.%y %H:%M:%S',
+                     "level": level,
+                     #"filename":logPath + "musicGrabber.log",
+                     "filemode":"w"}
+    logging.basicConfig(**loggingConfig)
+
+def main():
+    set_logger()
+    sparql = SPARQLWrapper.SPARQLWrapper(ENDPOINT)
+    sparql.setReturnFormat(SPARQLWrapper.JSON)
+    #CONNECT = pyodbc.connect('DSN=SysVirt;UID=dba;PWD=dba;HOST=localhost:1112')
+    get_some_artists(sparql, 500,0)
+
+
+
+if __name__ == '__main__':
+    main()
+    
\ No newline at end of file

Copied: graphRDF/branches/old2sparul/src/old2sparul.py (from rev 353, graphRDF/branches/old2sparul/old2sparul.py)
===================================================================
--- graphRDF/branches/old2sparul/src/old2sparul.py	(rev 0)
+++ graphRDF/branches/old2sparul/src/old2sparul.py	2009-06-22 14:08:10 UTC (rev 354)
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+old2sparul.py
+
+This is an ad hoc script for taking data from myrdfspace.com, cleaning it, and putting in sparql endpoint
+
+Created by Kurtis Random on 2009-02-03.
+Copyright (c) 2009 C4DM QMUL. All rights reserved.
+"""
+
+import sys
+import getopt
+from logging import log, error, warning, info, debug
+import logging
+import ftplib
+import SPARQLWrapper
+import mopy
+import urllib2
+import re
+from time import sleep
+
+help_message = '''
+take old myrdfspace files and add to the sparql endpoint...
+ -b --base <uri base from myrdfspace>
+ -s --start <uid to start from> useful after a crash ;-)
+'''
+
+failedList = []
+badQueryList = []
+
+defaultGraph = "http://dbtune.org/myspace-fj-2008"
+sparqlEndPoint = "http://dbtune.org/cmn/sparql"
+myspaceBase = "http://dbtune.org/myspace/uid"
+myspaceOnt = "http://purl.org/ontology/myspace"
+prefixes = """PREFIX owl: <http://www.w3.org/2002/07/owl#> \nPREFIX foaf: <http://xmlns.com/foaf/0.1/> \nPREFIX dc: <http://purl.org/dc/elements/1.1/> \nPREFIX mo: <http://purl.org/ontology/mo/>\nPREFIX myspace: <http://purl.org/ontology/myspace#>\nPREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\nPREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>"""
+
+insert = """ \ninsert into graph <"""+defaultGraph+"""> {"""
+
+apacheLimit = 2000
+
+class Usage(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+def tryImportRDF(filename, attempt):
+    if attempt < 5:
+        debug("importing rdf")
+        try:
+            mi = mopy.importRDFFile(filename)
+        except urllib2.URLError:
+            debug("URLError importing RDF, retrying")
+            sleep(1.0)
+            attempt+=1
+            tryImportRDF(filename, attempt)
+        else:
+            return mi
+    debug("import failed after tries: " + str(attempt))
+    return None
+
+def parseRDF(filename, base):
+    '''parse the rdf and return a sparql update query'''
+    sparqlU=''
+    mi = tryImportRDF(base+filename, 0)
+    if mi:
+        keys = mi.PersonIdx.keys()
+        for key in keys:
+            person = mi.PersonIdx[key]
+            if person.name:
+                # if we find the name, this is the main subject
+                suid = person.URI.split(base)[1]
+                subject = "<"+myspaceBase+"/"+suid+">"
+                name = person.name.pop()
+                sparqlU = sparqlU + '\n'+subject+' rdf:type mo:MusicArtist .'
+                sparqlU = sparqlU + '\n'+subject+' myspace:myspaceID "'+filename.rstrip('.rdf')+'"^^xsd:int .'
+                sparqlU = sparqlU + """\n"""+subject+' foaf:name "' + urllib2.quote(name)+'"@en . '
+
+                # get all the top friends
+                while(1):
+                    try:
+                        p = person.knows.pop()
+                    except:
+                        break
+                    else:
+                        ouid = p.URI.split(base)[1]
+                        obj = "<"+myspaceBase+"/"+ouid+">"
+                        sparqlU=sparqlU+ "\n"+subject+" foaf:knows "+ obj+ ' . ' "\n"+subject+" myspace:topFriend "+obj+ ' . '
+                        sparqlU = sparqlU + '\n'+obj+' rdf:type mo:MusicArtist .'
+
+                while(1):
+                    try:
+                        thm = person.theme.pop()
+                    except:
+                        debug("breaking from genre pops")
+                        break
+                    else:
+                        thm = thm.URI.split(base)[1]
+                        # do some cleaning, bad genres in there like 35123543.rdf instead of hip hop
+                        if not re.match(".*\.rdf",thm):
+                            debug("adding genre: "+thm)
+                            genre = "<"+myspaceOnt + "#"+urllib2.quote(thm)+">"
+                            sparqlU=sparqlU+ "\n"+subject+ " myspace:genreTag "+ genre+ ' . '
+
+                try:
+                    playcount = person.tipjar.pop().URI.split(base)[1]
+                    sparqlU=sparqlU+ "\n"+subject+ ' myspace:totalPlays "'+ playcount+'"^^xsd:int . '
+                except:
+                    pass
+
+        sparqlU=sparqlU+'}'
+        return sparqlU
+    else:
+        return None
+
+def setLogger():
+    '''just set the logger'''
+    loggingConfig = {"format":'%(asctime)s %(levelname)-8s %(message)s',
+                     "datefmt":'%d.%m.%y %H:%M:%S',
+                     "level": logging.DEBUG,
+                     #"filename":logPath + "musicGrabber.log",
+                     "filemode":"w"}
+    logging.basicConfig(**loggingConfig)
+
+def getFileListing(rdfFolder):
+    '''return a list of all the rdf files found w/ given base'''
+    rdfFolder = rdfFolder.rstrip('/')
+    rdfFolder = rdfFolder+'/'
+    ftp = ftplib.FTP("myrdfspace.com")
+    ftp.login("myrdf", "my1stRDF")
+    ftp.cwd("myrdfspace.com/"+rdfFolder)
+    vList = ftp.nlst()
+    return vList
+
+def trySparql(sparql, attempt, f):
+    try:
+        debug('attempting sparql update, try #' + str(attempt))
+        sparql.setReturnFormat(SPARQLWrapper.TURTLE)
+        ret = sparql.query().convert()
+    except urllib2.HTTPError:
+        debug('caught an http error, retrying...')
+        if attempt<5:
+            attempt+=1
+            sleep(2)
+            trySparql(sparql, attempt, f)
+        else:
+            error("more that 5 http errors, giving up")
+            failedList.append(f)
+    except SPARQLWrapper.sparqlexceptions.QueryBadFormed:
+        error("query failed for "+ str(f))
+        debug('$$$$$$$$$$$$$$$$BADLY FORMED QUERY$$$$$$$$$$$$$$$$$$$')
+        print sparql.queryString
+        badQueryList.append(f)
+        failedList.append(f)
+    except:
+        error("query failed for "+ str(f))
+        debug('************UPDATE FAILED***********')
+        failedList.append(f)
+        print "Unexpected error:", sys.exc_info()[0]
+        print sparql.queryString
+    else:
+        print ret
+        return ret
+    return None
+
+def splitQuery(query):
+    '''sometime the query is too long and should be broke in two pieces'''
+    lines = query.splitlines(1)
+    splits = []
+    split = ""
+    count = 0
+    for line in lines:
+        if count < apacheLimit:
+            split = split+line
+            count+=len(line)
+        else:
+            splits.append(insert+split+'}')
+            split= line
+            count = 0
+    splits.append(insert+split)
+    return splits
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv
+    try:
+        try:
+            opts, args = getopt.getopt(argv[1:], "ho:b:s:v", ["help", "output=","base=", "start="])
+        except getopt.error, msg:
+            raise Usage(msg)
+
+        # option processing
+        base = None
+        start = None
+        for option, value in opts:
+            if option == "-v":
+                verbose = True
+            if option in ("-h", "--help"):
+                raise Usage(help_message)
+            if option in ("-o", "--output"):
+                output = value
+            if option in ("-b", "--base"):
+                base = value
+            if option in ("-s", "--start"):
+                start = value
+            '''if option in ("-g", '--graph'):
+                defaultGraph = value
+                insert = """ \ninsert into graph <"""+defaultGraph+"""> {"""'''
+
+
+        setLogger()
+        if base == None:
+            raise Usage(help_message)
+            return 2
+        # parse base uri
+        folder = base.split("http://myrdfspace.com/")[1]
+        debug('getting list of files')
+        fileList = getFileListing(folder)
+        debug('got list of files')
+        #fileList = ['238729309.rdf', '13280592.rdf', '26412401.rdf', '8557307.rdf', '176635064.rdf', '12656647.rdf']
+        startIndex=0
+        if start:
+            try:
+                startIndex=fileList.index(start)
+            except:
+                debug("not a valid start file, not in list")
+
+        for f in fileList[startIndex:]:
+            debug('parsing on file: '+str(f))
+            #parse each file and do a sparql update to the repository
+            sparul = parseRDF(f, base)
+            sparql = SPARQLWrapper.SPARQLWrapper(sparqlEndPoint)
+            sparql.addDefaultGraph(defaultGraph)
+            if sparul:
+                # we have to deal w/ queries that are too long
+                if len(sparul) > apacheLimit:
+                    debug('query too long, splitting...')
+                    splitSparul = splitQuery(sparul)
+                    for split in splitSparul:
+                        sparql.setQuery(prefixes+split)
+                        trySparql(sparql, 0, f)
+                else:
+                    sparql.setQuery(prefixes+insert+sparul)
+                    trySparql(sparql, 0, f)
+            else:
+                debug('failure on '+str(f))
+                failedList.append(f)
+
+
+
+        debug("Complete!!!")
+        print "\n\nREPORT:\n\tfailures: "+str(len(failedList))
+        print "\nfails: "
+        print failedList
+        print "\n\nbad queries: "
+        print badQueryList
+
+    except Usage, err:
+        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
+        print >> sys.stderr, "\t for help use --help"
+        return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())


Property changes on: graphRDF/branches/old2sparul/src/old2sparul.py
___________________________________________________________________
Added: svn:mergeinfo
   + 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.