From: <gea...@us...> - 2009-01-16 17:24:34
|
Revision: 312 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=312&view=rev Author: gearmonkey Date: 2009-01-16 17:24:24 +0000 (Fri, 16 Jan 2009) Log Message: ----------- The most notable change in this rev is the change to the genre tag. It was picking up loads of garbage for artists with no genres listed. This was fixed by removing all the whitespace from the scrape tag, replacing the closing tag (it used to be a single carriage return), and cleaning up the whitespace stripping mechanism for genre in RDFtrans. This seems to result in correct answers for artists with no listed genre (no genre entry in the rdf file) instead of gibberish. I think the rdf generated by RDFtrans inside the myspaceCrawler project is actually bordering on sensible now (it's been valid since r309, but now it actually makes sense). The most notable exception is that there are still some oddities in the myspace ontology namespace that need to be dealt with (the namespace is showing as default5 instead of myspace). 
Modified Paths: -------------- myspaceCrawler/trunk/RDFtrans.py myspaceCrawler/trunk/myspaceuris.py myspaceCrawler/trunk/scraping.py Modified: myspaceCrawler/trunk/RDFtrans.py =================================================================== --- myspaceCrawler/trunk/RDFtrans.py 2009-01-15 20:06:13 UTC (rev 311) +++ myspaceCrawler/trunk/RDFtrans.py 2009-01-16 17:24:24 UTC (rev 312) @@ -86,18 +86,26 @@ friendUIDs = scrapePageWhile(self.HTML, friendTag[0], friendTag[1]) friendNames = scrapePageWhile(self.HTML, friendNameTag[0], friendNameTag[1]) friendPics = scrapePageWhile(self.HTML, friendPicTag[0], friendPicTag[1]) - - for i in range(len(friendUIDs)): - friend = mopy.myspace.Agent(self.NSprefix + str(friendUIDs[i])) + + if len(friendUIDs) != len(friendNames): + logging.info("Ther seems to be a different number of friend names (" + str(len(friendNames)) + + ") than friend IDs (" + str(len(friendUIDs)) + ") scraped off uid #" + str(self.uid) +".\nverify rdf.") + if len(friendUIDs) != len(friendPics): + logging.info("Ther seems to be a different number of friend pictures (" + str(len(friendPics)) + + ") than friend IDs (" + str(len(friendUIDs)) + ") scraped off uid #" + str(self.uid) +".\nverify rdf.") + + for idx, friendUID in enumerate(friendUIDs): + friend = mopy.myspace.Agent(self.NSprefix + str(friendUID)) try: - friend.name.set(friendNames[i]) + friend.name.set(friendNames[idx]) + logging.debug("adding friend with uid " + str(friendUID) + " whose name is " + str(friendNames[idx])) except Exception, err: logging.error("A friend name mismatch occurred in the rdf translation.\nRDFtrans::getFriends::" + str(err)) # refer to dbtune incase this friend isnt in crawl - thing = mopy.owl.Thing(dbtuneMyspace + 'uid/' + str(friendUIDs[i])) + thing = mopy.owl.Thing(dbtuneMyspace + 'uid/' + str(friendUID)) friend.sameAs.set(thing) try: - img = mopy.foaf.Image(friendPics[i]) + img = mopy.foaf.Image(friendPics[idx]) friend.depiction.add(img) self.mi.add(img) except: @@ 
-203,13 +211,13 @@ genreraw = scrapePage(self.HTML, genreTag[0], genreTag[1]) if genreraw == None: return genreraw - genreraw = str(genreraw).lstrip() - genreraw = genreraw.rstrip() + genreraw = str(genreraw).strip() + if genreraw == '': + return None genres = genreraw.split('/') genresfixed = [] for genre in genres: - genre = genre.rstrip() - genre = genre.lstrip() + genre = genre.strip() g = mopy.mo.Genre(myspaceOwlURI+'#'+urllib.quote(str(genre))) g.name.set(genre) self.mi.add(g) Modified: myspaceCrawler/trunk/myspaceuris.py =================================================================== --- myspaceCrawler/trunk/myspaceuris.py 2009-01-15 20:06:13 UTC (rev 311) +++ myspaceCrawler/trunk/myspaceuris.py 2009-01-16 17:24:24 UTC (rev 312) @@ -10,11 +10,11 @@ # useful tags playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';''' # ### this tag will be terminated by a '.' ### -friendTag = '''<td bgcolor="FFFFFF" align="center" valign="top" width="107" style="word-wrap:break-word">\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"''' +friendTag = ''' <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"''' # new tag updated 13/1/2009 #""" <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewprofile&friendid=""", '''"''' # ### tag will be terminated by a '"' ### -friendNameTag = """_friendLink">""", '''<''' +friendNameTag = '''_friendLink">''', '''<''' ### tag terminated by '<' ### userIDTag = '''"DisplayFriendId":''', ''',"IsLoggedIn"''' # 13/1/2009 @@ -24,7 +24,8 @@ # ### tag terminated by a ';' ### nameTag = """<span class="nametext">""", '''<''' # ### tag term by '<' -genreTag = '''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>\r\n\t\t\t\t\t''', ''' \r''' +#the returned identifier will inevitably be surrouned by whitespace that will need to be 
stripped +genreTag = '''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>''', '''</strong>''' #'''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>''', '''<''' # ### tag terminated by '<' niceURLTag = '''<td><div align="left"> <span><a href="''', '''">''' Modified: myspaceCrawler/trunk/scraping.py =================================================================== --- myspaceCrawler/trunk/scraping.py 2009-01-15 20:06:13 UTC (rev 311) +++ myspaceCrawler/trunk/scraping.py 2009-01-16 17:24:24 UTC (rev 312) @@ -35,7 +35,7 @@ logging.debug("Found identifier : "+identifier) return identifier; -def scrapePageWhile(page, patterns, termChar): +def scrapePageWhile(page, pattern, termChar): """Scrape the page given for each pattern and return a list with each identifier occurring after the last pattern (which is assumed to be terminated by termChar)""" @@ -44,8 +44,8 @@ idx_end = len(page) identifiers = [] itsFound = 1 + logging.debug("pattern : "+ pattern) while itsFound: - pattern = patterns idx = page.find(pattern, idx) #logging.debug("idx = "+str(idx)) if (idx > idx_end): # Couldn't find this pattern before re-occurrence of last pattern @@ -59,7 +59,7 @@ #logging.debug("idx_end = "+str(idx_end)) if idx != -1: - idx += len(patterns) + idx += len(pattern) # idx should now point to the start of the identifier we want id_end = page.find(termChar, idx) identifier = unicode(page[idx:id_end], 'utf8') This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |