[Mypyspace-developer] SF.net SVN: mypyspace:[312] myspaceCrawler/trunk

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 312
          http://mypyspace.svn.sourceforge.net/mypyspace/?rev=312&view=rev
Author:   gearmonkey
Date:     2009-01-16 17:24:24 +0000 (Fri, 16 Jan 2009)

Log Message:
-----------
The most notable change in this rev is the change to the genre tag.  It was picking up loads of garbage for artists with no genres listed.  This was fixed by removing all the whitespace from the scrape tag, replacing the closing tag (it used to be a single carriage return), and cleaning up the whitespace-stripping mechanism for genre in RDFtrans.  This seems to result in correct answers for artists with no listed genre (no genre entry in the rdf file) instead of gibberish.

I think the RDF generated by RDFtrans inside the myspaceCrawler project is actually bordering on sensible now (it's been valid since r309, but now it actually makes sense).  The most notable exception is that there are still some oddities in the myspace ontology namespace that need to be dealt with (the namespace is showing as default5 instead of myspace).

Modified Paths:
--------------
    myspaceCrawler/trunk/RDFtrans.py
    myspaceCrawler/trunk/myspaceuris.py
    myspaceCrawler/trunk/scraping.py

Modified: myspaceCrawler/trunk/RDFtrans.py
===================================================================

--- myspaceCrawler/trunk/RDFtrans.py	2009-01-15 20:06:13 UTC (rev 311)
+++ myspaceCrawler/trunk/RDFtrans.py	2009-01-16 17:24:24 UTC (rev 312)
@@ -86,18 +86,26 @@
 			friendUIDs = scrapePageWhile(self.HTML, friendTag[0], friendTag[1])
 			friendNames = scrapePageWhile(self.HTML, friendNameTag[0], friendNameTag[1])
 			friendPics = scrapePageWhile(self.HTML, friendPicTag[0], friendPicTag[1])
-
-			for i in range(len(friendUIDs)):
-				friend = mopy.myspace.Agent(self.NSprefix + str(friendUIDs[i]))
+			
+			if len(friendUIDs) != len(friendNames):
+				logging.info("Ther seems to be a different number of friend names (" + str(len(friendNames)) + 
+					") than friend IDs (" + str(len(friendUIDs)) + ") scraped off uid #" + str(self.uid) +".\nverify rdf.")
+			if len(friendUIDs) != len(friendPics):
+				logging.info("Ther seems to be a different number of friend pictures (" + str(len(friendPics)) + 
+					") than friend IDs (" + str(len(friendUIDs)) + ") scraped off uid #" + str(self.uid) +".\nverify rdf.")
+				
+			for idx, friendUID in enumerate(friendUIDs):
+				friend = mopy.myspace.Agent(self.NSprefix + str(friendUID))
 				try:
-					friend.name.set(friendNames[i])
+					friend.name.set(friendNames[idx])
+					logging.debug("adding friend with uid " + str(friendUID) + " whose name is " + str(friendNames[idx]))
 				except Exception, err:
 					logging.error("A friend name mismatch occurred in the rdf translation.\nRDFtrans::getFriends::" + str(err))
 				# refer to dbtune incase this friend isnt in crawl
-				thing = mopy.owl.Thing(dbtuneMyspace + 'uid/' + str(friendUIDs[i]))
+				thing = mopy.owl.Thing(dbtuneMyspace + 'uid/' + str(friendUID))
 				friend.sameAs.set(thing)
 				try:
-					img = mopy.foaf.Image(friendPics[i])
+					img = mopy.foaf.Image(friendPics[idx])
 					friend.depiction.add(img)
 					self.mi.add(img)
 				except:
@@ -203,13 +211,13 @@
 		genreraw = scrapePage(self.HTML, genreTag[0], genreTag[1])
 		if genreraw == None:
 			return genreraw
-		genreraw = str(genreraw).lstrip()
-		genreraw = genreraw.rstrip()
+		genreraw = str(genreraw).strip()
+		if genreraw == '':
+			return None
 		genres = genreraw.split('/')
 		genresfixed = []
 		for genre in genres:
-			genre = genre.rstrip()
-			genre = genre.lstrip()
+			genre = genre.strip()
 			g = mopy.mo.Genre(myspaceOwlURI+'#'+urllib.quote(str(genre)))
 			g.name.set(genre)
 			self.mi.add(g)

Modified: myspaceCrawler/trunk/myspaceuris.py
===================================================================
--- myspaceCrawler/trunk/myspaceuris.py	2009-01-15 20:06:13 UTC (rev 311)
+++ myspaceCrawler/trunk/myspaceuris.py	2009-01-16 17:24:24 UTC (rev 312)
@@ -10,11 +10,11 @@
 # useful tags
 playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';'''
 #						###	this tag will be terminated by a '.' ###
-friendTag = '''<td bgcolor="FFFFFF" align="center" valign="top" width="107" style="word-wrap:break-word">\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t&nbsp;<a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"'''
+friendTag = '''&nbsp;<a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"'''
 # new tag updated 13/1/2009
 #"""&nbsp;<a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewprofile&friendid=""", '''"'''
 #						### tag will be terminated by a '"' ###
-friendNameTag = """_friendLink">""", '''<'''
+friendNameTag = '''_friendLink">''', '''<'''
 						### tag terminated by '<' ###
 userIDTag = '''"DisplayFriendId":''', ''',"IsLoggedIn"'''
 # 13/1/2009
@@ -24,7 +24,8 @@
 #						### tag terminated by a ';' ###
 nameTag = """<span class="nametext">""", '''<'''
 #						### tag term by '<'
-genreTag = '''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>\r\n\t\t\t\t\t''', ''' \r'''
+#the returned identifier will inevitably be surrouned by whitespace that will need to be stripped
+genreTag = '''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>''', '''</strong>'''
 #'''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>''', '''<'''
 #						### tag terminated by '<'
 niceURLTag = '''<td><div align="left">&nbsp;&nbsp;<span><a href="''', '''">'''

Modified: myspaceCrawler/trunk/scraping.py
===================================================================
--- myspaceCrawler/trunk/scraping.py	2009-01-15 20:06:13 UTC (rev 311)
+++ myspaceCrawler/trunk/scraping.py	2009-01-16 17:24:24 UTC (rev 312)
@@ -35,7 +35,7 @@
 	logging.debug("Found identifier : "+identifier)
 	return identifier;
 	
-def scrapePageWhile(page, patterns, termChar):
+def scrapePageWhile(page, pattern, termChar):
 	"""Scrape the page given for each pattern and return a list with each identifier occurring after the
 	  last pattern (which is assumed to be terminated by termChar)"""
 	
@@ -44,8 +44,8 @@
 	idx_end = len(page)
 	identifiers = []
 	itsFound = 1
+	logging.debug("pattern : "+ pattern)
 	while itsFound:
-		pattern = patterns
 		idx = page.find(pattern, idx)
 		#logging.debug("idx = "+str(idx))
 		if (idx > idx_end): # Couldn't find this pattern before re-occurrence of last pattern
@@ -59,7 +59,7 @@
 		#logging.debug("idx_end = "+str(idx_end))
 	
 		if idx != -1:
-			idx += len(patterns)
+			idx += len(pattern)
 		# idx should now point to the start of the identifier we want
 		id_end = page.find(termChar, idx)
 		identifier = unicode(page[idx:id_end], 'utf8')


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.