[Mypyspace-developer] SF.net SVN: mypyspace:[334] musicGrabber/branches/webserv-branch/myspace2rdf.py

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 334
          http://mypyspace.svn.sourceforge.net/mypyspace/?rev=334&view=rev
Author:   kurtjx
Date:     2009-03-08 12:04:23 +0000 (Sun, 08 Mar 2009)

Log Message:
-----------
Some bug fixes for getting the genre and for getting the playlistID / artistID: verify that the scraped string is indeed an integer, because sometimes it is junk.

Modified Paths:
--------------
    musicGrabber/branches/webserv-branch/myspace2rdf.py

Modified: musicGrabber/branches/webserv-branch/myspace2rdf.py
===================================================================

--- musicGrabber/branches/webserv-branch/myspace2rdf.py	2009-03-08 12:00:15 UTC (rev 333)
+++ musicGrabber/branches/webserv-branch/myspace2rdf.py	2009-03-08 12:04:23 UTC (rev 334)
@@ -18,6 +18,7 @@
 #Am making use of regular expressions
 import re 
 import urllib
+from mpsSong import mpsSong
 
 
 help_message = '''
@@ -44,6 +45,7 @@
 		self.mi = mopy.MusicInfo()
 		
 	def getPage(self):
+		'''just grab the web page'''
 		url = viewProfileURLbase + str(self.uid)
 		resp = try_open(url)
 		if resp==None:
@@ -109,6 +111,7 @@
 				ppd = mopy.foaf.PersonalProfileDocument("")
 				ppd.primaryTopic.set(self.subject)
 				self.mi.add(ppd)
+				# assuming the 'name' tag must be present, if not it's a bad url
 				self.name = scrapePage(self.page, [nameTag[0]], nameTag[1])
 				if self.name:
 					self.subject.name.set(self.name)
@@ -168,13 +171,7 @@
 			self.xmlStruct = dom.parseString(''.join(xmlPage.readlines()))
 			songList = self.xmlStruct.getElementsByTagName('song')
 			for song in songList:
-				'''try:
-					songTitle = unicodedata.normalize('NFKC',song.getAttribute('title')).encode('ascii','ignore')
-				except AttributeError, err:
-					songTitle = str(None)
-				except IndexError, err:
-					songTitle = str(None)
-				#availableAs = song.getAttribute('durl')'''
+				# using ben's mpsSong class
 				thisSong = mpsSong(self, song, 'downloadprefix')
 				thisSong.getUri()
 				
@@ -185,7 +182,6 @@
 					avas = mopy.mo.MusicalItem(availableAs)
 					track.available_as.set(avas)
 					self.mi.add(avas)
-				#track.available_as.set(mopy.rdfs.Resource(availableAs))
 				self.subject.made.add(track)
 				self.mi.add(track)
 				
@@ -201,7 +197,11 @@
 	def scrapeArtistID(self):
 		'''attempt to find via scrape of page the internal artist number.'''
 		try:
-			self.artistID = scrapePage(self.page, [artistIDtag[0]], artistIDtag[1])
+			ids = scrapePageWhile(self.page, artistIDtag[0], artistIDtag[1])
+			for i in ids:
+				if i.isdigit():
+					self.artistID = i
+			# self.artistID = scrapePage(self.page, [artistIDtag[0]], artistIDtag[1])
 			return True
 		except Exception, err:
 			print "Ran into trouble trying to scrape the ArtistID for page from " + self.source  + "\nError::" + str(err)
@@ -210,7 +210,12 @@
 	def scrapePlaylistNumber(self):
 		"""attempts to find via scrape of the internal identifier of an artist's playlist of songs"""
 		try:
-			self.playlistID = scrapePage(self.page, [playlistIDtag[0]], playlistIDtag[1])
+			# make sure we get a digit and not some crap - maybe should to regex
+			ids = scrapePageWhile(self.page, playlistIDtag[0], playlistIDtag[1])
+			for i in ids:
+				if i.isdigit():
+					self.playlistID = i
+			#self.playlistID = scrapePage(self.page, [playlistIDtag[0]], playlistIDtag[1])
 			return True
 		except Exception, err:
 			print "Ran into trouble trying to scrape the playlistID for page from " + self.source  + "\nError::" + str(err)
@@ -293,150 +298,23 @@
 		genreNums = re.findall(''':"(.|..|...)"''', localGenres) # should return only 2 or 3 char string between 
 		genres = []
 		for gnum in genreNums:
-			genre = mopy.mo.Genre(myspaceOntology+urllib.quote(genreDict[int(gnum)]))
-			genre.name.set(genreDict[int(gnum)])
-			self.mi.add(genre)
-			self.subject.genreTag.add(genre)
-			genres.append(genre)
+			try:
+				genre = mopy.mo.Genre(myspaceOntology+urllib.quote(genreDict[int(gnum)]))
+			except KeyError:
+				pass
+			else:
+				genre.name.set(genreDict[int(gnum)])
+				self.mi.add(genre)
+				self.subject.genreTag.add(genre)
+				genres.append(genre)
 
 		return genres
 		
 		
-class mpsSong:
-	"""a class that wraps around the downloading, feature extracting and modeling of a piece of media attached to a mpsUser
-	mpsSong object instances have the following public variables:
-		parent           --     a weakref to the mpsUser that generated the mpsSong instance
-        uri              --     lo res cached download link
-        betterUri        --     hi res cached download link (not always available)
-        downloadprefix   --     local prefix to stick the file when downloaded
-        extractionprefix --     local prefix to stick the feature files when extracted
-        title            --     title of song
-        image            --     url to get image associated with song
-        playcount        --     number of times song has been played via myspace player
-        trackNum         --     track number based on order presented on myspace
-        totalTracks      --     number of songs available for parent
-        filename         --     name used for local lofi file, when downloaded
-        HIFIfilename     --     name used for local hifi file, when downloaded
-        beats            --     local name of beat segmentaton file, used to do variable segment length feature extraction
 
-	"""
-	def __init__(self, parent, xmlNode, downloadprefix = '', extractionprefix = ''):
-		"""initializes the mpsSong class.  Parent is a pointer to the calling mpsUser, xmlNode should be a DOM object with the songs info.  downloadprefix is the local directory prefix where the media will be put, default is an empty string.  If no extractionprefix is given, extracted features will be places in the dir pointed to by downloadprefix"""
-		#self.parent = weakref.ref(parent)
-		self.xmlNode = xmlNode
-		self.getUri()
-		#the nicer file download is currently broken...
-		#self.betterURI = xmlNode.getAttribute('downloadable')
-		self.downloadprefix = downloadprefix
-		if extractionprefix == '':
-			self.extractionprefix = downloadprefix
-		else:
-			self.extractionprefix = extractionprefix
-		self.title = self.exhaustiveXML.getElementsByTagName('title')[0].firstChild.nodeValue
-		self.image = self.exhaustiveXML.getElementsByTagName('small')[0].firstChild.nodeValue
-		self.playcount = xmlNode.getElementsByTagName('stats')[0].getAttribute('plays')
-		self.comments = "" #this is a blank string hold for the comments fields.  Might be used later.
-		self.trackNum, self.totalTracks = None, None
-		self.filename, self.HIFIfilename = None, None
-		self.beats = None
 
-	def getUri(self):
-		self.songID = self.xmlNode.getAttribute('songId')
-		xmlPage = try_open(songBase[0] + str(self.songID) + songBase[1])
-		self.exhaustiveXML = dom.parseString(''.join(xmlPage.readlines()))
-		xmlPage.close()
-		try:
-			self.uri = self.exhaustiveXML.getElementsByTagName('link')[0].firstChild.nodeValue
-		except AttributeError, err:
-			#logging.info("mpsUser::getUri ran into a problem finding the download link for a song by artist with uid: " + 
-			#	str(self.parent().uid) + " link will be left blank.\n\tError msg: " + str(err))
-			pass
-			self.uri = ''
 
-	def setTrackNum(self, trackNumber, totalTracks):
-		'''set the track number for this song and the number of tracks in the album it is in.'''
-		self.trackNum = trackNumber
-		self.totalTracks = totalTracks
 
-	def download(self):
-		'''download the track.  
-		Upon success set self.filename to the local location of the downloaded song and return true.  
-		On FAIL return false.'''
-		logging.debug("downloading " + self.title + " by " + self.parent().artist +  " to " + self.downloadprefix)
-		if self.trackNum != None:
-			filename = unicode(str(self.trackNum), 'utf8')  + u'_' +  self.title + u'.mp3'
-		else:
-			filename =   self.title + u'.mp3'
-		if try_get(self.uri, os.path.join(self.downloadprefix, filename)) != None:
-			logging.debug("success on " +  self.title + " by " + self.parent().artist +  " to " + os.path.join(self.downloadprefix,filename))
-			self.filename = filename
-			return True
-		else:
-			logging.debug("FAIL on " + self.title + " by " + self.parent().artist +  " to " + os.path.join(self.downloadprefix,filename))
-			return False
-
-	def downloadHIFI(self):
-		'''if it exists, download the hi fidelity version of the track.  
-		Upon success set self.HIFIfilename to the local location of the downloaded song and return true.  
-		On FAIL return false.'''
-		if not self.betterURI:
-			logging.info("NO hi-fi version of " + self.title + " by " + self.parent().artist + " but we did look for it.")
-			return False
-		logging.debug("downloading hifi copy of " + self.title + "by" + self.parent().artist +  " to " + self.downloadprefix)
-		if self.trackNum != None:
-			filename = unicode(str(self.trackNum), 'utf8') + u'_' + self.title + u'_hifi.mp3'
-		else:
-			filename =  self.title + u'_hifi.mp3'
-		if (try_get(self.betteruri, os.path.join(self.downloadprefix,filename)) != None):
-			logging.debug("success on hi-fi version of " + self.title + " by " + self.parent().artist +  " to " + os.path.join(self.downloadprefix,filename))
-			self.HIFIfilename = filename
-			return True
-		else:
-			logging.debug("FAIL on hi-fi version of " + self.title + " by " + self.parent().artist +  " to " + os.path.join(self.downloadprefix,filename))
-			return False
-
-
-	def tag(self, hifi = False):
-		'''create or modify the id3 tag for downloaded song associated with self. set optional hifi arg to tag the hifi download'''
-		if hifi:
-			fileToTag = os.path.join(self.downloadprefix,self.HIFIfilename)
-		else:
-			fileToTag = os.path.join(self.downloadprefix,self.filename)
-		if fileToTag == None:
-			logging.info("asked to tag a file associated with uid: " + str(self.parent().uid) + " but the song does not exist locally")			
-		logging.debug("adding tags to " + fileToTag)
-		try: id3 = mutagen.id3.ID3(fileToTag)
-		except mutagen.id3.ID3NoHeaderError:
-			logging.info("No ID3 header found for " + fileToTag + "; creating tag from scratch")
-			id3 = mutagen.id3.ID3()
-		except Exception, err:
-			logging.error(str(err))
-			return
-		id3.add(mutagen.id3.TIT2(encoding=3,text=self.title))
-		id3.add(mutagen.id3.TPE1(encoding=3,text=self.parent().artist))
-		id3.add(mutagen.id3.COMM(encoding=3,text=self.comments, lang="eng", desc=""))
-		#id3.add(mutagen.id3.COMM(encoding=3,text=relationshipLink, lang="eng", desc="MusicGrabberSig"))	
-		id3.add(mutagen.id3.TALB(encoding=3,text=self.parent().album))
-		if self.trackNum != None:
-			id3.add(mutagen.id3.TRCK(encoding=3,text=str(self.trackNum) + '/' + str(self.totalTracks)))
-		id3.add(mutagen.id3.POPM(encoding=3,email=str(self.parent().uid)+"@myspace", rating = 128, count=self.playcount))
-		if self.image == None:
-			logging.error("No image present for " + self.title + ", " + self.parent().artist)
-		try:
-			logging.debug("trying to get image from " + self.image)
-			localImgPath, imgHeader = try_get(self.image, os.path.join("/tmp",os.path.basename(self.image)))
-			imgHandle = open(localImgPath)
-			id3.add(mutagen.id3.APIC(encoding=3, mime=imgHeader.type, data=imgHandle.read(), type=17, desc="Song pic from myspace.com"))
-		except:
-			logging.error("Unable to retieve image for " + self.title + ", " + self.parent().artist)
-		try:
-			id3.save(fileToTag)
-		except Exception, err:
-			logging.error(str(err) + ";couldn\'t save the tag for " + self.title + " by " + self.parent().artist)
-
-
-
-
 def main(argv=None):
 	if argv is None:
 		argv = sys.argv


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Mypyspace-developer] SF.net SVN: mypyspace:[334] musicGrabber/branches/webserv-branch/ myspace2rdf

[Mypyspace-developer] SF.net SVN: mypyspace:[334] musicGrabber/branches/webserv-branch/ myspace2rdf.py