From: <gea...@us...> - 2009-01-16 17:24:34
|
Revision: 312 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=312&view=rev Author: gearmonkey Date: 2009-01-16 17:24:24 +0000 (Fri, 16 Jan 2009) Log Message: ----------- The most notable change in this rev is the change to the genre tag. It was picking up loads of garbage for artists with no genres listed. This was fixed by removing all the whitespace from the scrape tag, replacing the closing tag (it used to be a single carriage return) and cleaning up the whitespace stripping mechanism for genre in RDFtrans. This seems to result in correct answers for artists with no listed genre (no genre entry in the rdf file) instead of gibberish. I think the rdf generated by RDFtrans inside the myspaceCrawler project is actually bordering on sensible now (it's been valid since r309, but now it actually makes sense). The most notable exception is that there are still some oddities in the myspace ontology namespace that need to be dealt with (the namespace is showing as default5 instead of myspace). 
Modified Paths: -------------- myspaceCrawler/trunk/RDFtrans.py myspaceCrawler/trunk/myspaceuris.py myspaceCrawler/trunk/scraping.py Modified: myspaceCrawler/trunk/RDFtrans.py =================================================================== --- myspaceCrawler/trunk/RDFtrans.py 2009-01-15 20:06:13 UTC (rev 311) +++ myspaceCrawler/trunk/RDFtrans.py 2009-01-16 17:24:24 UTC (rev 312) @@ -86,18 +86,26 @@ friendUIDs = scrapePageWhile(self.HTML, friendTag[0], friendTag[1]) friendNames = scrapePageWhile(self.HTML, friendNameTag[0], friendNameTag[1]) friendPics = scrapePageWhile(self.HTML, friendPicTag[0], friendPicTag[1]) - - for i in range(len(friendUIDs)): - friend = mopy.myspace.Agent(self.NSprefix + str(friendUIDs[i])) + + if len(friendUIDs) != len(friendNames): + logging.info("Ther seems to be a different number of friend names (" + str(len(friendNames)) + + ") than friend IDs (" + str(len(friendUIDs)) + ") scraped off uid #" + str(self.uid) +".\nverify rdf.") + if len(friendUIDs) != len(friendPics): + logging.info("Ther seems to be a different number of friend pictures (" + str(len(friendPics)) + + ") than friend IDs (" + str(len(friendUIDs)) + ") scraped off uid #" + str(self.uid) +".\nverify rdf.") + + for idx, friendUID in enumerate(friendUIDs): + friend = mopy.myspace.Agent(self.NSprefix + str(friendUID)) try: - friend.name.set(friendNames[i]) + friend.name.set(friendNames[idx]) + logging.debug("adding friend with uid " + str(friendUID) + " whose name is " + str(friendNames[idx])) except Exception, err: logging.error("A friend name mismatch occurred in the rdf translation.\nRDFtrans::getFriends::" + str(err)) # refer to dbtune incase this friend isnt in crawl - thing = mopy.owl.Thing(dbtuneMyspace + 'uid/' + str(friendUIDs[i])) + thing = mopy.owl.Thing(dbtuneMyspace + 'uid/' + str(friendUID)) friend.sameAs.set(thing) try: - img = mopy.foaf.Image(friendPics[i]) + img = mopy.foaf.Image(friendPics[idx]) friend.depiction.add(img) self.mi.add(img) except: @@ 
-203,13 +211,13 @@ genreraw = scrapePage(self.HTML, genreTag[0], genreTag[1]) if genreraw == None: return genreraw - genreraw = str(genreraw).lstrip() - genreraw = genreraw.rstrip() + genreraw = str(genreraw).strip() + if genreraw == '': + return None genres = genreraw.split('/') genresfixed = [] for genre in genres: - genre = genre.rstrip() - genre = genre.lstrip() + genre = genre.strip() g = mopy.mo.Genre(myspaceOwlURI+'#'+urllib.quote(str(genre))) g.name.set(genre) self.mi.add(g) Modified: myspaceCrawler/trunk/myspaceuris.py =================================================================== --- myspaceCrawler/trunk/myspaceuris.py 2009-01-15 20:06:13 UTC (rev 311) +++ myspaceCrawler/trunk/myspaceuris.py 2009-01-16 17:24:24 UTC (rev 312) @@ -10,11 +10,11 @@ # useful tags playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';''' # ### this tag will be terminated by a '.' ### -friendTag = '''<td bgcolor="FFFFFF" align="center" valign="top" width="107" style="word-wrap:break-word">\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"''' +friendTag = ''' <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"''' # new tag updated 13/1/2009 #""" <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewprofile&friendid=""", '''"''' # ### tag will be terminated by a '"' ### -friendNameTag = """_friendLink">""", '''<''' +friendNameTag = '''_friendLink">''', '''<''' ### tag terminated by '<' ### userIDTag = '''"DisplayFriendId":''', ''',"IsLoggedIn"''' # 13/1/2009 @@ -24,7 +24,8 @@ # ### tag terminated by a ';' ### nameTag = """<span class="nametext">""", '''<''' # ### tag term by '<' -genreTag = '''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>\r\n\t\t\t\t\t''', ''' \r''' +#the returned identifier will inevitably be surrouned by whitespace that will need to be 
stripped +genreTag = '''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>''', '''</strong>''' #'''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>''', '''<''' # ### tag terminated by '<' niceURLTag = '''<td><div align="left"> <span><a href="''', '''">''' Modified: myspaceCrawler/trunk/scraping.py =================================================================== --- myspaceCrawler/trunk/scraping.py 2009-01-15 20:06:13 UTC (rev 311) +++ myspaceCrawler/trunk/scraping.py 2009-01-16 17:24:24 UTC (rev 312) @@ -35,7 +35,7 @@ logging.debug("Found identifier : "+identifier) return identifier; -def scrapePageWhile(page, patterns, termChar): +def scrapePageWhile(page, pattern, termChar): """Scrape the page given for each pattern and return a list with each identifier occurring after the last pattern (which is assumed to be terminated by termChar)""" @@ -44,8 +44,8 @@ idx_end = len(page) identifiers = [] itsFound = 1 + logging.debug("pattern : "+ pattern) while itsFound: - pattern = patterns idx = page.find(pattern, idx) #logging.debug("idx = "+str(idx)) if (idx > idx_end): # Couldn't find this pattern before re-occurrence of last pattern @@ -59,7 +59,7 @@ #logging.debug("idx_end = "+str(idx_end)) if idx != -1: - idx += len(patterns) + idx += len(pattern) # idx should now point to the start of the identifier we want id_end = page.find(termChar, idx) identifier = unicode(page[idx:id_end], 'utf8') This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <gea...@us...> - 2009-01-16 17:59:35
|
Revision: 313 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=313&view=rev Author: gearmonkey Date: 2009-01-16 17:59:29 +0000 (Fri, 16 Jan 2009) Log Message: ----------- corrected the default base uri for rdf generated via mpsUser. Modified Paths: -------------- musicGrabber/branches/webserv-branch/myspace2rdf.py myspaceCrawler/tags/0.8.1b_release/myspaceCrawler.py myspaceCrawler/trunk/mpsUser.py myspaceCrawler/trunk/myspaceuris.py Modified: musicGrabber/branches/webserv-branch/myspace2rdf.py =================================================================== --- musicGrabber/branches/webserv-branch/myspace2rdf.py 2009-01-16 17:24:24 UTC (rev 312) +++ musicGrabber/branches/webserv-branch/myspace2rdf.py 2009-01-16 17:59:29 UTC (rev 313) @@ -102,10 +102,8 @@ - should switch to check for genre tags instead???''' if self.page: - ############################################# - # kludge set playr to always flase for now ## - ############################################# - player = False #= scrapePage(self.page, [playerTag[0]], playerTag[1]) + + player = scrapePage(self.page, [playerTag[0]], playerTag[1]) if player: self.subject = mopy.mo.MusicArtist(dbtuneMyspace+'uid/'+str(self.uid)) #self.subjecttwo = mopy.foaf.Person('http://dbtune.org/myspace/uid/'+str(self.uid)) Modified: myspaceCrawler/tags/0.8.1b_release/myspaceCrawler.py =================================================================== --- myspaceCrawler/tags/0.8.1b_release/myspaceCrawler.py 2009-01-16 17:24:24 UTC (rev 312) +++ myspaceCrawler/tags/0.8.1b_release/myspaceCrawler.py 2009-01-16 17:59:29 UTC (rev 313) @@ -32,7 +32,7 @@ -THREAD_CAP = 30 #maximum number of threads allowed to be firing at once +THREAD_CAP = 10000 #maximum number of threads allowed to be firing at once THREAD_STALL_TIME = 30 #length of time in seconds to wait until the thread count is checked again LOG_FILENAME = "musicCrawler.log" #name of logger file (path set at commandline) Modified: myspaceCrawler/trunk/mpsUser.py 
=================================================================== --- myspaceCrawler/trunk/mpsUser.py 2009-01-16 17:24:24 UTC (rev 312) +++ myspaceCrawler/trunk/mpsUser.py 2009-01-16 17:59:29 UTC (rev 313) @@ -55,7 +55,7 @@ ''' - def __init__(self, url, rdfprefix = dbtuneMyspace): + def __init__(self, url, rdfprefix = dbtuneMyspace + 'uid/'): """Initialization will set the source url, attempt to create a socket connection with the url and determine if this mpsUser is an artist. If the user given is an artist, the initialization will also scrape the top Friends. rdfprefix is the uri base prepended to the uids of other myspace resources, by default it is set to the dbtune live service""" self.source = url self.uid = -1 Modified: myspaceCrawler/trunk/myspaceuris.py =================================================================== --- myspaceCrawler/trunk/myspaceuris.py 2009-01-16 17:24:24 UTC (rev 312) +++ myspaceCrawler/trunk/myspaceuris.py 2009-01-16 17:59:29 UTC (rev 313) @@ -46,7 +46,6 @@ ######################################################################################################### - # myspace uri for downloads ----this has gotten a bit more complicated in the roll out of myspace's new media player # this xml file gives the songIDs, the songsIDs must be used individually to request another xml file that then contains the uri to the cached media # @@ -75,7 +74,6 @@ myspaceOwlURI = 'http://grasstunes.net/ontology/myspace.owl' dbtuneMyspace = 'http://dbtune.org/myspace/' - countries = ['Afghanistan', 'Albania', 'Algeria', 'American Samoa','Andorra', 'Angola','Anguilla','Antarctica','Antigua and Barbuda','Argentina', 'Armenia','Aruba','Australia','Austria','Azerbaijan','Bahamas', This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <gea...@us...> - 2009-01-23 16:57:46
|
Revision: 314 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=314&view=rev Author: gearmonkey Date: 2009-01-23 16:57:40 +0000 (Fri, 23 Jan 2009) Log Message: ----------- removed the playerTag as it's broken and unreliable. Replaced the artist check functionality by checking for genre formatting tags. The robustness of this method is to be determined, however it seems to work in most cases. Also, slightly altered some of the parameters used in url grabbing. Modified Paths: -------------- myspaceCrawler/trunk/RDFtrans.py myspaceCrawler/trunk/mpsUser.py myspaceCrawler/trunk/myspaceuris.py myspaceCrawler/trunk/tryurl.py Modified: myspaceCrawler/trunk/RDFtrans.py =================================================================== --- myspaceCrawler/trunk/RDFtrans.py 2009-01-16 17:59:29 UTC (rev 313) +++ myspaceCrawler/trunk/RDFtrans.py 2009-01-23 16:57:40 UTC (rev 314) @@ -61,12 +61,15 @@ def isArtist(self): '''is current page an artist???''' if self.HTML: - player = scrapePage(self.HTML, playerTag[0], playerTag[1]) + if genreTag[0] in self.page: + artist = True + else: + artist = False if not scrapePage(self.HTML, nameTag[0], nameTag[1]) == None: self.name = scrapePage(self.HTML, nameTag[0], nameTag[1]) else: self.name = str(None) - if player: + if artist: # make the mopy subject a myspace:MusicArtist self.subject = mopy.myspace.MusicArtist(self.NSprefix+str(self.uid)) # set the subject name Modified: myspaceCrawler/trunk/mpsUser.py =================================================================== --- myspaceCrawler/trunk/mpsUser.py 2009-01-16 17:59:29 UTC (rev 313) +++ myspaceCrawler/trunk/mpsUser.py 2009-01-23 16:57:40 UTC (rev 314) @@ -146,8 +146,8 @@ return xmlStruct def artistCheck(self): - '''for a given mpsUser with read source, check to see if it is an artist profile''' - if playerTag[0] in self.page: + '''for a given mpsUser with read source, check to see if it is an artist profile. 
This is done by examining the html source for the presence of genre labels. Note that even an artist without genre tags, will have these bits of markup, they will simply be blank.''' + if genreTag[0] in self.page: return True else: return False Modified: myspaceCrawler/trunk/myspaceuris.py =================================================================== --- myspaceCrawler/trunk/myspaceuris.py 2009-01-16 17:59:29 UTC (rev 313) +++ myspaceCrawler/trunk/myspaceuris.py 2009-01-23 16:57:40 UTC (rev 314) @@ -8,7 +8,8 @@ ######################################################################################################### # useful tags -playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';''' +#the player tag is broken, so we're going to use the genre tag as an artist check +#playerTag = """SWFObject("http://musicservices.myspace.com/Modules/MusicServices/Services/Embed.ashx/ptype=4""", ''';''' # ### this tag will be terminated by a '.' ### friendTag = ''' <a href="http://profile.myspace.com/index.cfm?fuseaction=user.viewProfile&friendID=''', '''"''' # new tag updated 13/1/2009 Modified: myspaceCrawler/trunk/tryurl.py =================================================================== --- myspaceCrawler/trunk/tryurl.py 2009-01-16 17:59:29 UTC (rev 313) +++ myspaceCrawler/trunk/tryurl.py 2009-01-23 16:57:40 UTC (rev 314) @@ -5,8 +5,8 @@ #keepalive comes from the urlgrabber project, licensed under GPL and available here: http://linux.duke.edu/projects/urlgrabber/ import logging #changing to urllib2 and using a recently added timeout feature, so that the socket will timeout after TIMEOUT seconds -TIMEOUT = 12 -SLEEPTIME = .25 +TIMEOUT = 15 +SLEEPTIME = 5 #use the following three lines and import keepalive to use the keep alive urlopener This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <gea...@us...> - 2009-01-27 13:55:12
|
Revision: 319 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=319&view=rev Author: gearmonkey Date: 2009-01-27 13:55:06 +0000 (Tue, 27 Jan 2009) Log Message: ----------- fixed the image url problem, now actually point to song images not myspace default filler images. added a bit more inline documentation to the api examples code. Modified Paths: -------------- myspaceCrawler/trunk/examples.py myspaceCrawler/trunk/mpsUser.py Modified: myspaceCrawler/trunk/examples.py =================================================================== --- myspaceCrawler/trunk/examples.py 2009-01-27 12:43:42 UTC (rev 318) +++ myspaceCrawler/trunk/examples.py 2009-01-27 13:55:06 UTC (rev 319) @@ -6,7 +6,7 @@ some simple functions demonstrating mpsUser and mpsSong functionality Created by Benjamin Fields on 2008-11-09. -Copyright (c) 2008 __MyCompanyName__. All rights reserved. +Copyright (c) 2008 Goldsmiths. All rights reserved. """ import sys @@ -36,7 +36,7 @@ return 0 def socialCharts(initArtist, radius, chartLength=1): - '''breadth first crawl of width radius to find most chartLength popular songs from the center initArtist.''' + '''breadth first crawl of width radius to find at most chartLength popular songs from the center initArtist.''' songQueue = [] visitedArtists = [] artistsInThisLevel = [initArtist] Modified: myspaceCrawler/trunk/mpsUser.py =================================================================== --- myspaceCrawler/trunk/mpsUser.py 2009-01-27 12:43:42 UTC (rev 318) +++ myspaceCrawler/trunk/mpsUser.py 2009-01-27 13:55:06 UTC (rev 319) @@ -46,11 +46,13 @@ isArtist -- Boolean, True means instance describes a MySpace artist with media rdfprefix -- prefix for all rdf UIRs page -- locally loaded copy of html pointed to by source + The following are only set if user is found to be an artist mediaXML -- locally loaded (via miniDom) copy of xml describing playlist of media assciated - with myspace Artist (not set in non artists) + with myspace Artist totalPlays -- sum 
of playcounts of all songs associated with myspace Artist - (not set in non Artist) - artist -- self declared name of artist (not set in non Artist) + artist -- self declared name of artist + artistID -- unique ID possessed by artists only, needed to retrieve media and media related meta data + playlistID -- unique ID used to retrieve playlist found on page ''' @@ -342,7 +344,7 @@ else: self.extractionprefix = extractionprefix self.title = self.exhaustiveXML.getElementsByTagName('title')[0].firstChild.nodeValue - self.image = self.exhaustiveXML.getElementsByTagName('small')[0].firstChild.nodeValue + self.getimage() self.playcount = xmlNode.getElementsByTagName('stats')[0].getAttribute('plays') self.comments = "" #this is a blank string hold for the comments fields. Might be used later. self.trackNum, self.totalTracks = None, None @@ -357,10 +359,31 @@ try: self.uri = self.exhaustiveXML.getElementsByTagName('link')[0].firstChild.nodeValue except AttributeError, err: - logging.info("mpsUser::getUri ran into a problem finding the download link for a song by artist with uid: " + + logging.info("mpsUser::mpsSong::getUri ran into a problem finding the download link for a song by artist with uid: " + str(self.parent().uid) + " link will be left blank.\n\tError msg: " + str(err)) self.uri = '' - + def getimage(self): + '''find an image associated with the song, getting the largest resolution available''' + try: + self.image = self.exhaustiveXML.getElementsByTagName('track')[0].getElementsByTagName('large')[0].firstChild.nodeValue + except AttributeError: + try: + self.image = self.exhaustiveXML.getElementsByTagName('track')[0].getElementsByTagName('medium')[0].firstChild.nodeValue + except AttributeError: + try: + self.image = self.exhaustiveXML.getElementsByTagName('track')[0].getElementsByTagName('small')[0].firstChild.nodeValue + except Exception, err: + logging.info("mpsUser::mpsSong::getimage ran into a problem finding an image for a song by artist with uid: " + + 
str(self.parent().uid) + " image will be left blank.\n\tError msg: " + str(err)) + self.image = '' + except Exception, err: + logging.info("mpsUser::mpsSong::getimage ran into a problem finding an image for a song by artist with uid: " + + str(self.parent().uid) + " image will be left blank.\n\tError msg: " + str(err)) + self.image = '' + except Exception, err: + logging.info("mpsUser::mpsSong::getimage ran into a problem finding an image for a song by artist with uid: " + + str(self.parent().uid) + " image will be left blank.\n\tError msg: " + str(err)) + self.image = '' def setTrackNum(self, trackNumber, totalTracks): '''set the track number for this song and the number of tracks in the album it is in.''' self.trackNum = trackNumber This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |