From: <ku...@us...> - 2009-02-18 12:56:54
|
Revision: 327 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=327&view=rev Author: kurtjx Date: 2009-02-18 12:56:50 +0000 (Wed, 18 Feb 2009) Log Message: ----------- added some regex stuff to get rid of bad genre tags, sometime 1324123.rdf was set as a theme which was a bug in the old code i guess Modified Paths: -------------- graphRDF/branches/old2sparul/old2sparul.py Modified: graphRDF/branches/old2sparul/old2sparul.py =================================================================== --- graphRDF/branches/old2sparul/old2sparul.py 2009-02-18 12:51:29 UTC (rev 326) +++ graphRDF/branches/old2sparul/old2sparul.py 2009-02-18 12:56:50 UTC (rev 327) @@ -3,8 +3,10 @@ """ old2sparul.py +This is an ad hoc script for taking data from myrdfspace.com, cleaning it, and putting in sparql endpoint + Created by Kurtis Random on 2009-02-03. -Copyright (c) 2009 __MyCompanyName__. All rights reserved. +Copyright (c) 2009 C4DM QMUL. All rights reserved. """ import sys @@ -12,21 +14,22 @@ from logging import log, error, warning, info, debug import logging import ftplib -#from SPARQLWrapper import SPARQLWrapper import SPARQLWrapper import mopy import urllib2 +import re from time import sleep help_message = ''' take old myrdfspace files and add to the sparql endpoint... -b --base <uri base from myrdfspace> + -s --start <uid to start from> useful after a crash ;-) ''' failedList = [] badQueryList = [] -defaultGraph = "http://dbtune.org/myspace-fj-set-2008" +defaultGraph = "http://dbtune.org/myspace-fj-2008" sparqlEndPoint = "http://dbtune.org/cmn/sparql" myspaceBase = "http://dbtune.org/myspace/uid" myspaceOnt = "http://purl.org/ontology/myspace" @@ -50,7 +53,8 @@ sleep(1.0) attempt+=1 tryImportRDF(filename, attempt) - return mi + else: + return mi debug("import failed after tries: " + str(attempt)) return None @@ -58,45 +62,55 @@ '''parse the rdf and return a sparql update query''' sparqlU='' mi = tryImportRDF(base+filename, 0) - keys = mi.PersonIdx.keys() - for key in keys: - person = mi.PersonIdx[key] - if person.name: - # if we find the name, this is the main subject - suid = person.URI.split(base)[1] - subject = "<"+myspaceBase+"/"+suid+">" - name = person.name.pop() - sparqlU = sparqlU + '\n'+subject+' rdf:type mo:MusicArtist .' - sparqlU = sparqlU + '\n'+subject+' myspace:myspaceID "'+filename.rstrip('.rdf')+'"^^xsd:int .' - sparqlU = sparqlU + """\n"""+subject+' foaf:name "' + urllib2.quote(name)+'"@en . ' + if mi: + keys = mi.PersonIdx.keys() + for key in keys: + person = mi.PersonIdx[key] + if person.name: + # if we find the name, this is the main subject + suid = person.URI.split(base)[1] + subject = "<"+myspaceBase+"/"+suid+">" + name = person.name.pop() + sparqlU = sparqlU + '\n'+subject+' rdf:type mo:MusicArtist .' + sparqlU = sparqlU + '\n'+subject+' myspace:myspaceID "'+filename.rstrip('.rdf')+'"^^xsd:int .' + sparqlU = sparqlU + """\n"""+subject+' foaf:name "' + urllib2.quote(name)+'"@en . ' - # get all the top friends - while(1): - try: - p = person.knows.pop() - ouid = p.URI.split(base)[1] - obj = "<"+myspaceBase+"/"+ouid+">" - sparqlU=sparqlU+ "\n"+subject+" foaf:knows "+ obj+ ' . ' "\n"+subject+" myspace:topFriend "+obj+ ' . ' - sparqlU = sparqlU + '\n'+obj+' rdf:type mo:MusicArtist .' - except: - break + # get all the top friends + while(1): + try: + p = person.knows.pop() + except: + break + else: + ouid = p.URI.split(base)[1] + obj = "<"+myspaceBase+"/"+ouid+">" + sparqlU=sparqlU+ "\n"+subject+" foaf:knows "+ obj+ ' . ' "\n"+subject+" myspace:topFriend "+obj+ ' . ' + sparqlU = sparqlU + '\n'+obj+' rdf:type mo:MusicArtist .' - while(1): + while(1): + try: + thm = person.theme.pop() + except: + debug("breaking from genre pops") + break + else: + thm = thm.URI.split(base)[1] + # do some cleaning, bad genres in there like 35123543.rdf instead of hip hop + if not re.match(".*\.rdf",thm): + debug("adding genre: "+thm) + genre = "<"+myspaceOnt + "#"+urllib2.quote(thm)+">" + sparqlU=sparqlU+ "\n"+subject+ " myspace:genreTag "+ genre+ ' . ' + try: - thm = person.theme.pop() - genre = "<"+myspaceOnt + "#"+urllib2.quote(thm.URI.split(base)[1])+">" - sparqlU=sparqlU+ "\n"+subject+ " myspace:genreTag "+ genre+ ' . ' + playcount = person.tipjar.pop().URI.split(base)[1] + sparqlU=sparqlU+ "\n"+subject+ ' myspace:totalPlays "'+ playcount+'"^^xsd:int . ' except: - break - - try: - playcount = person.tipjar.pop().URI.split(base)[1] - sparqlU=sparqlU+ "\n"+subject+ ' myspace:totalPlays "'+ playcount+'"^^xsd:int . ' - except: - pass + pass - sparqlU=sparqlU+'}' - return sparqlU + sparqlU=sparqlU+'}' + return sparqlU + else: + return None def setLogger(): '''just set the logger''' @@ -216,17 +230,20 @@ sparul = parseRDF(f, base) sparql = SPARQLWrapper.SPARQLWrapper(sparqlEndPoint) sparql.addDefaultGraph(defaultGraph) - - # we have to deal w/ queries that are too long - if len(sparul) > apacheLimit: - debug('query too long, splitting...') - splitSparul = splitQuery(sparul) - for split in splitSparul: - sparql.setQuery(prefixes+split) + if sparul: + # we have to deal w/ queries that are too long + if len(sparul) > apacheLimit: + debug('query too long, splitting...') + splitSparul = splitQuery(sparul) + for split in splitSparul: + sparql.setQuery(prefixes+split) + trySparql(sparql, 0, f) + else: + sparql.setQuery(prefixes+insert+sparul) trySparql(sparql, 0, f) else: - sparql.setQuery(prefixes+insert+sparul) - trySparql(sparql, 0, f) + debug('failure on '+str(f)) + failedList.append(f) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |