|
From: <gea...@us...> - 2009-01-23 16:57:46
|
Revision: 314
http://mypyspace.svn.sourceforge.net/mypyspace/?rev=314&view=rev
Author: gearmonkey
Date: 2009-01-23 16:57:40 +0000 (Fri, 23 Jan 2009)
Log Message:
-----------
Removed the playerTag as it's broken and unreliable. Replaced the artist check functionality with a check for genre formatting tags. The robustness of this method is yet to be determined; however, it seems to work in most cases.
Also, slightly altered some of the parameters used in url grabbing.
Modified Paths:
--------------
myspaceCrawler/trunk/RDFtrans.py
myspaceCrawler/trunk/mpsUser.py
myspaceCrawler/trunk/myspaceuris.py
myspaceCrawler/trunk/tryurl.py
Modified: myspaceCrawler/trunk/RDFtrans.py
===================================================================
--- myspaceCrawler/trunk/RDFtrans.py 2009-01-16 17:59:29 UTC (rev 313)
+++ myspaceCrawler/trunk/RDFtrans.py 2009-01-23 16:57:40 UTC (rev 314)
@@ -61,12 +61,15 @@
def isArtist(self):
'''is current page an artist???'''
if self.HTML:
- player = scrapePage(self.HTML, playerTag[0], playerTag[1])
+ if genreTag[0] in self.page:
+ artist = True
+ else:
+ artist = False
if not scrapePage(self.HTML, nameTag[0], nameTag[1]) == None:
self.name = scrapePage(self.HTML, nameTag[0], nameTag[1])
else:
self.name = str(None)
- if player:
+ if artist:
# make the mopy subject a myspace:MusicArtist
self.subject = mopy.myspace.MusicArtist(self.NSprefix+str(self.uid))
# set the subject name
Modified: myspaceCrawler/trunk/mpsUser.py
===================================================================
--- myspaceCrawler/trunk/mpsUser.py 2009-01-16 17:59:29 UTC (rev 313)
+++ myspaceCrawler/trunk/mpsUser.py 2009-01-23 16:57:40 UTC (rev 314)
@@ -146,8 +146,8 @@
return xmlStruct
def artistCheck(self):
- '''for a given mpsUser with read source, check to see if it is an artist profile'''
- if playerTag[0] in self.page:
+ '''for a given mpsUser with read source, check to see if it is an artist profile. This is done by examining the html source for the presence of genre labels. Note that even an artist without genre tags, will have these bits of markup, they will simply be blank.'''
+ if genreTag[0] in self.page:
return True
else:
return False
Modified: myspaceCrawler/trunk/myspaceuris.py
===================================================================
--- myspaceCrawler/trunk/myspaceuris.py 2009-01-16 17:59:29 UTC (rev 313)
+++ myspaceCrawler/trunk/myspaceuris.py 2009-01-23 16:57:40 UTC (rev 314)
@@ -8,7 +8,8 @@
#########################################################################################################
# useful tags
-playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';'''
+#the player tag is broken, so we're going to use the genre tag as an artist check
+#playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';'''
# ### this tag will be terminated by a '.' ###
friendTag = ''' <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"'''
# new tag updated 13/1/2009
Modified: myspaceCrawler/trunk/tryurl.py
===================================================================
--- myspaceCrawler/trunk/tryurl.py 2009-01-16 17:59:29 UTC (rev 313)
+++ myspaceCrawler/trunk/tryurl.py 2009-01-23 16:57:40 UTC (rev 314)
@@ -5,8 +5,8 @@
#keepalive comes from the urlgrabber project, licensed under GPL and available here: http://linux.duke.edu/projects/urlgrabber/
import logging
#changing to urllib2 and using a recently added timeout feature, so that the socket will timeout after TIMEOUT seconds
-TIMEOUT = 12
-SLEEPTIME = .25
+TIMEOUT = 15
+SLEEPTIME = 5
#use the following three lines and import keepalive to use the keep alive urlopener
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|