From: <gea...@us...> - 2009-01-23 16:57:46
|
Revision: 314 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=314&view=rev Author: gearmonkey Date: 2009-01-23 16:57:40 +0000 (Fri, 23 Jan 2009) Log Message: ----------- removed the playerTag as it's broken and unreliable. Replaced the artist check functionality by checking for genre formatting tags. The robustness of this method is to be determined, however it seems to work in most cases. Also, slightly altered some of the parameters used in url grabbing. Modified Paths: -------------- myspaceCrawler/trunk/RDFtrans.py myspaceCrawler/trunk/mpsUser.py myspaceCrawler/trunk/myspaceuris.py myspaceCrawler/trunk/tryurl.py Modified: myspaceCrawler/trunk/RDFtrans.py =================================================================== --- myspaceCrawler/trunk/RDFtrans.py 2009-01-16 17:59:29 UTC (rev 313) +++ myspaceCrawler/trunk/RDFtrans.py 2009-01-23 16:57:40 UTC (rev 314) @@ -61,12 +61,15 @@ def isArtist(self): '''is current page an artist???''' if self.HTML: - player = scrapePage(self.HTML, playerTag[0], playerTag[1]) + if genreTag[0] in self.page: + artist = True + else: + artist = False if not scrapePage(self.HTML, nameTag[0], nameTag[1]) == None: self.name = scrapePage(self.HTML, nameTag[0], nameTag[1]) else: self.name = str(None) - if player: + if artist: # make the mopy subject a myspace:MusicArtist self.subject = mopy.myspace.MusicArtist(self.NSprefix+str(self.uid)) # set the subject name Modified: myspaceCrawler/trunk/mpsUser.py =================================================================== --- myspaceCrawler/trunk/mpsUser.py 2009-01-16 17:59:29 UTC (rev 313) +++ myspaceCrawler/trunk/mpsUser.py 2009-01-23 16:57:40 UTC (rev 314) @@ -146,8 +146,8 @@ return xmlStruct def artistCheck(self): - '''for a given mpsUser with read source, check to see if it is an artist profile''' - if playerTag[0] in self.page: + '''for a given mpsUser with read source, check to see if it is an artist profile. 
This is done by examining the html source for the presence of genre labels. Note that even an artist without genre tags will have these bits of markup; they will simply be blank.''' + if genreTag[0] in self.page: return True else: return False Modified: myspaceCrawler/trunk/myspaceuris.py =================================================================== --- myspaceCrawler/trunk/myspaceuris.py 2009-01-16 17:59:29 UTC (rev 313) +++ myspaceCrawler/trunk/myspaceuris.py 2009-01-23 16:57:40 UTC (rev 314) @@ -8,7 +8,8 @@ ######################################################################################################### # useful tags -playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';''' +#the player tag is broken, so we're going to use the genre tag as an artist check +#playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';''' # ### this tag will be terminated by a '.' ### friendTag = ''' <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"''' # new tag updated 13/1/2009 Modified: myspaceCrawler/trunk/tryurl.py =================================================================== --- myspaceCrawler/trunk/tryurl.py 2009-01-16 17:59:29 UTC (rev 313) +++ myspaceCrawler/trunk/tryurl.py 2009-01-23 16:57:40 UTC (rev 314) @@ -5,8 +5,8 @@ #keepalive comes from the urlgrabber project, licensed under GPL and available here: http://linux.duke.edu/projects/urlgrabber/ import logging #changing to urllib2 and using a recently added timeout feature, so that the socket will timeout after TIMEOUT seconds -TIMEOUT = 12 -SLEEPTIME = .25 +TIMEOUT = 15 +SLEEPTIME = 5 #use the following three lines and import keepalive to use the keep alive urlopener This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |