From: <ku...@us...> - 2009-03-08 12:04:37
|
Revision: 334 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=334&view=rev Author: kurtjx Date: 2009-03-08 12:04:23 +0000 (Sun, 08 Mar 2009) Log Message: ----------- some bug fixes for getting genre and getting playlistID / artistID - verify the string is indeed an integer cuz sometimes it is junk Modified Paths: -------------- musicGrabber/branches/webserv-branch/myspace2rdf.py Modified: musicGrabber/branches/webserv-branch/myspace2rdf.py =================================================================== --- musicGrabber/branches/webserv-branch/myspace2rdf.py 2009-03-08 12:00:15 UTC (rev 333) +++ musicGrabber/branches/webserv-branch/myspace2rdf.py 2009-03-08 12:04:23 UTC (rev 334) @@ -18,6 +18,7 @@ #Am making use of regular expressions import re import urllib +from mpsSong import mpsSong help_message = ''' @@ -44,6 +45,7 @@ self.mi = mopy.MusicInfo() def getPage(self): + '''just grab the web page''' url = viewProfileURLbase + str(self.uid) resp = try_open(url) if resp==None: @@ -109,6 +111,7 @@ ppd = mopy.foaf.PersonalProfileDocument("") ppd.primaryTopic.set(self.subject) self.mi.add(ppd) + # assuming the 'name' tag must be present, if not it's a bad url self.name = scrapePage(self.page, [nameTag[0]], nameTag[1]) if self.name: self.subject.name.set(self.name) @@ -168,13 +171,7 @@ self.xmlStruct = dom.parseString(''.join(xmlPage.readlines())) songList = self.xmlStruct.getElementsByTagName('song') for song in songList: - '''try: - songTitle = unicodedata.normalize('NFKC',song.getAttribute('title')).encode('ascii','ignore') - except AttributeError, err: - songTitle = str(None) - except IndexError, err: - songTitle = str(None) - #availableAs = song.getAttribute('durl')''' + # using ben's mpsSong class thisSong = mpsSong(self, song, 'downloadprefix') thisSong.getUri() @@ -185,7 +182,6 @@ avas = mopy.mo.MusicalItem(availableAs) track.available_as.set(avas) self.mi.add(avas) - #track.available_as.set(mopy.rdfs.Resource(availableAs)) self.subject.made.add(track) 
self.mi.add(track) @@ -201,7 +197,11 @@ def scrapeArtistID(self): '''attempt to find via scrape of page the internal artist number.''' try: - self.artistID = scrapePage(self.page, [artistIDtag[0]], artistIDtag[1]) + ids = scrapePageWhile(self.page, artistIDtag[0], artistIDtag[1]) + for i in ids: + if i.isdigit(): + self.artistID = i + # self.artistID = scrapePage(self.page, [artistIDtag[0]], artistIDtag[1]) return True except Exception, err: print "Ran into trouble trying to scrape the ArtistID for page from " + self.source + "\nError::" + str(err) @@ -210,7 +210,12 @@ def scrapePlaylistNumber(self): """attempts to find via scrape of the internal identifier of an artist's playlist of songs""" try: - self.playlistID = scrapePage(self.page, [playlistIDtag[0]], playlistIDtag[1]) + # make sure we get a digit and not some crap - maybe should to regex + ids = scrapePageWhile(self.page, playlistIDtag[0], playlistIDtag[1]) + for i in ids: + if i.isdigit(): + self.playlistID = i + #self.playlistID = scrapePage(self.page, [playlistIDtag[0]], playlistIDtag[1]) return True except Exception, err: print "Ran into trouble trying to scrape the playlistID for page from " + self.source + "\nError::" + str(err) @@ -293,150 +298,23 @@ genreNums = re.findall(''':"(.|..|...)"''', localGenres) # should return only 2 or 3 char string between genres = [] for gnum in genreNums: - genre = mopy.mo.Genre(myspaceOntology+urllib.quote(genreDict[int(gnum)])) - genre.name.set(genreDict[int(gnum)]) - self.mi.add(genre) - self.subject.genreTag.add(genre) - genres.append(genre) + try: + genre = mopy.mo.Genre(myspaceOntology+urllib.quote(genreDict[int(gnum)])) + except KeyError: + pass + else: + genre.name.set(genreDict[int(gnum)]) + self.mi.add(genre) + self.subject.genreTag.add(genre) + genres.append(genre) return genres -class mpsSong: - """a class that wraps around the downloading, feature extracting and modeling of a piece of media attached to a mpsUser - mpsSong object instances have the 
following public variables: - parent -- a weakref to the mpsUser that generated the mpsSong instance - uri -- lo res cached download link - betterUri -- hi res cached download link (not always available) - downloadprefix -- local prefix to stick the file when downloaded - extractionprefix -- local prefix to stick the feature files when extracted - title -- title of song - image -- url to get image associated with song - playcount -- number of times song has been played via myspace player - trackNum -- track number based on order presented on myspace - totalTracks -- number of songs available for parent - filename -- name used for local lofi file, when downloaded - HIFIfilename -- name used for local hifi file, when downloaded - beats -- local name of beat segmentaton file, used to do variable segment length feature extraction - """ - def __init__(self, parent, xmlNode, downloadprefix = '', extractionprefix = ''): - """initializes the mpsSong class. Parent is a pointer to the calling mpsUser, xmlNode should be a DOM object with the songs info. downloadprefix is the local directory prefix where the media will be put, default is an empty string. If no extractionprefix is given, extracted features will be places in the dir pointed to by downloadprefix""" - #self.parent = weakref.ref(parent) - self.xmlNode = xmlNode - self.getUri() - #the nicer file download is currently broken... - #self.betterURI = xmlNode.getAttribute('downloadable') - self.downloadprefix = downloadprefix - if extractionprefix == '': - self.extractionprefix = downloadprefix - else: - self.extractionprefix = extractionprefix - self.title = self.exhaustiveXML.getElementsByTagName('title')[0].firstChild.nodeValue - self.image = self.exhaustiveXML.getElementsByTagName('small')[0].firstChild.nodeValue - self.playcount = xmlNode.getElementsByTagName('stats')[0].getAttribute('plays') - self.comments = "" #this is a blank string hold for the comments fields. Might be used later. 
- self.trackNum, self.totalTracks = None, None - self.filename, self.HIFIfilename = None, None - self.beats = None - def getUri(self): - self.songID = self.xmlNode.getAttribute('songId') - xmlPage = try_open(songBase[0] + str(self.songID) + songBase[1]) - self.exhaustiveXML = dom.parseString(''.join(xmlPage.readlines())) - xmlPage.close() - try: - self.uri = self.exhaustiveXML.getElementsByTagName('link')[0].firstChild.nodeValue - except AttributeError, err: - #logging.info("mpsUser::getUri ran into a problem finding the download link for a song by artist with uid: " + - # str(self.parent().uid) + " link will be left blank.\n\tError msg: " + str(err)) - pass - self.uri = '' - def setTrackNum(self, trackNumber, totalTracks): - '''set the track number for this song and the number of tracks in the album it is in.''' - self.trackNum = trackNumber - self.totalTracks = totalTracks - def download(self): - '''download the track. - Upon success set self.filename to the local location of the downloaded song and return true. - On FAIL return false.''' - logging.debug("downloading " + self.title + " by " + self.parent().artist + " to " + self.downloadprefix) - if self.trackNum != None: - filename = unicode(str(self.trackNum), 'utf8') + u'_' + self.title + u'.mp3' - else: - filename = self.title + u'.mp3' - if try_get(self.uri, os.path.join(self.downloadprefix, filename)) != None: - logging.debug("success on " + self.title + " by " + self.parent().artist + " to " + os.path.join(self.downloadprefix,filename)) - self.filename = filename - return True - else: - logging.debug("FAIL on " + self.title + " by " + self.parent().artist + " to " + os.path.join(self.downloadprefix,filename)) - return False - - def downloadHIFI(self): - '''if it exists, download the hi fidelity version of the track. - Upon success set self.HIFIfilename to the local location of the downloaded song and return true. 
- On FAIL return false.''' - if not self.betterURI: - logging.info("NO hi-fi version of " + self.title + " by " + self.parent().artist + " but we did look for it.") - return False - logging.debug("downloading hifi copy of " + self.title + "by" + self.parent().artist + " to " + self.downloadprefix) - if self.trackNum != None: - filename = unicode(str(self.trackNum), 'utf8') + u'_' + self.title + u'_hifi.mp3' - else: - filename = self.title + u'_hifi.mp3' - if (try_get(self.betteruri, os.path.join(self.downloadprefix,filename)) != None): - logging.debug("success on hi-fi version of " + self.title + " by " + self.parent().artist + " to " + os.path.join(self.downloadprefix,filename)) - self.HIFIfilename = filename - return True - else: - logging.debug("FAIL on hi-fi version of " + self.title + " by " + self.parent().artist + " to " + os.path.join(self.downloadprefix,filename)) - return False - - - def tag(self, hifi = False): - '''create or modify the id3 tag for downloaded song associated with self. 
set optional hifi arg to tag the hifi download''' - if hifi: - fileToTag = os.path.join(self.downloadprefix,self.HIFIfilename) - else: - fileToTag = os.path.join(self.downloadprefix,self.filename) - if fileToTag == None: - logging.info("asked to tag a file associated with uid: " + str(self.parent().uid) + " but the song does not exist locally") - logging.debug("adding tags to " + fileToTag) - try: id3 = mutagen.id3.ID3(fileToTag) - except mutagen.id3.ID3NoHeaderError: - logging.info("No ID3 header found for " + fileToTag + "; creating tag from scratch") - id3 = mutagen.id3.ID3() - except Exception, err: - logging.error(str(err)) - return - id3.add(mutagen.id3.TIT2(encoding=3,text=self.title)) - id3.add(mutagen.id3.TPE1(encoding=3,text=self.parent().artist)) - id3.add(mutagen.id3.COMM(encoding=3,text=self.comments, lang="eng", desc="")) - #id3.add(mutagen.id3.COMM(encoding=3,text=relationshipLink, lang="eng", desc="MusicGrabberSig")) - id3.add(mutagen.id3.TALB(encoding=3,text=self.parent().album)) - if self.trackNum != None: - id3.add(mutagen.id3.TRCK(encoding=3,text=str(self.trackNum) + '/' + str(self.totalTracks))) - id3.add(mutagen.id3.POPM(encoding=3,email=str(self.parent().uid)+"@myspace", rating = 128, count=self.playcount)) - if self.image == None: - logging.error("No image present for " + self.title + ", " + self.parent().artist) - try: - logging.debug("trying to get image from " + self.image) - localImgPath, imgHeader = try_get(self.image, os.path.join("/tmp",os.path.basename(self.image))) - imgHandle = open(localImgPath) - id3.add(mutagen.id3.APIC(encoding=3, mime=imgHeader.type, data=imgHandle.read(), type=17, desc="Song pic from myspace.com")) - except: - logging.error("Unable to retieve image for " + self.title + ", " + self.parent().artist) - try: - id3.save(fileToTag) - except Exception, err: - logging.error(str(err) + ";couldn\'t save the tag for " + self.title + " by " + self.parent().artist) - - - - def main(argv=None): if argv is None: argv = 
sys.argv

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.