From: <ku...@us...> - 2009-02-03 15:15:46
|
Revision: 320 http://mypyspace.svn.sourceforge.net/mypyspace/?rev=320&view=rev Author: kurtjx Date: 2009-02-03 15:15:36 +0000 (Tue, 03 Feb 2009) Log Message: ----------- fixed a bug in the genre scraping Modified Paths: -------------- musicGrabber/branches/webserv-branch/myspace2rdf.py musicGrabber/branches/webserv-branch/myspaceuris.py Modified: musicGrabber/branches/webserv-branch/myspace2rdf.py =================================================================== --- musicGrabber/branches/webserv-branch/myspace2rdf.py 2009-01-27 13:55:06 UTC (rev 319) +++ musicGrabber/branches/webserv-branch/myspace2rdf.py 2009-02-03 15:15:36 UTC (rev 320) @@ -263,7 +263,7 @@ def scrapeGenre(self): - genreraw = scrapePage(self.page, [genreTag[0]], genreTag[1]) + '''genreraw = scrapePage(self.page, [genreTag[0]], genreTag[1]) if genreraw == None: return genreraw genreraw = str(genreraw).lstrip() @@ -278,8 +278,22 @@ self.mi.add(g) self.subject.genreTag.add(g) genresfixed.append(genre) - return genresfixed + return genresfixed''' + localGenres = scrapePage(self.page, [genreTag[0]], genreTag[1]) + if localGenres == None: + return None + genreNums = re.findall(''':"(.|..|...)"''', localGenres) # should return only 2 or 3 char string between + genres = [] + for gnum in genreNums: + genre = mopy.mo.Genre(myspaceOntology+urllib.quote(genreDict[int(gnum)])) + genre.name.set(genreDict[int(gnum)]) + self.mi.add(genre) + self.subject.genreTag.add(genre) + genres.append(genre) + + return genres + class mpsSong: """a class that wraps around the downloading, feature extracting and modeling of a piece of media attached to a mpsUser mpsSong object instances have the following public variables: Modified: musicGrabber/branches/webserv-branch/myspaceuris.py =================================================================== --- musicGrabber/branches/webserv-branch/myspaceuris.py 2009-01-27 13:55:06 UTC (rev 319) +++ musicGrabber/branches/webserv-branch/myspaceuris.py 2009-02-03 15:15:36 UTC (rev 320) @@ -26,7 +26,8 @@ # ### tag terminated by a ';' ### nameTag = """<span class="nametext">""", '''<''' # ### tag term by '<' -genreTag = '''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>\r\n\t\t\t\t\t''', ''' \r''' +genreTag = '''MySpace.Ads.BandType = {''', '''}''' +#genreTag = '''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>\r\n\t\t\t\t\t''', ''' \r''' #'''<font color="#033330" size="1" face="Arial, Helvetica, sans-serif"><strong>''', '''<''' # ### tag terminated by '<' niceURLTag = '''<td><div align="left"> <span><a href="''', '''">''' @@ -70,6 +71,9 @@ #adding this back in to lessen the broken... dbtuneMyspace = 'http://dbtune.org/myspace/' +#new dict for genres valid as of 2009 feb 3 +genreDict = {0:"", 61:"2-step", 59:"A'cappella", 125:"Acousmatic / Tape music", 1:"Acoustic", 73:"Afro-beat", 2:"Alternative", 3:"Ambient", 93:"Americana", 98:"Anime Song", 65:"Big Beat", 51:"Black Metal", 4:"Bluegrass", 5:"Blues", 105:"Bossa Nova", 60:"Breakbeat", 129:"Breakcore", 118:"Celtic", 109:"Children", 134:"Chinese pop", 135:"Chinese traditional", 6:"Christian", 7:"Christian Rap", 8:"Classic Rock", 77:"Classical", 110:"Classical - Opera and Vocal", 9:"Club", 10:"Comedy", 126:"Concrete", 11:"Country", 12:"Death Metal", 63:"Disco House", 70:"Down-tempo", 50:"Drum & Bass", 68:"Dub", 123:"Dutch pop", 67:"Electro", 127:"Electroacoustic", 13:"Electronica", 14:"Emo", 133:"Emotronic", 15:"Experimental", 107:"Flamenco", 16:"Folk", 17:"Folk Rock", 119:"French pop", 18:"Funk", 124:"Fusion", 56:"Garage", 120:"German pop", 79:"Glam", 112:"Gospel", 46:"Gothic", 95:"Grime", 47:"Grindcore", 19:"Grunge", 71:"Happy Hardcore", 57:"Hard House", 20:"Hardcore", 104:"Healing & EasyListening", 21:"Hip Hop", 22:"House", 69:"IDM", 97:"Idol", 23:"Indie", 45:"Industrial", 121:"Italian pop", 24:"Jam Band", 103:"Japanese Classic Music", 100:"Japanese Pop", 25:"Jazz", 58:"Jungle", 101:"Korean Pop", 49:"Latin", 128:"Live Electronics", 75:"Lounge", 113:"Lyrical", 102:"Melodramatic Popular Song", 26:"Metal", 131:"Minimalist", 76:"New Wave", 66:"Nu-Jazz", 27:"Other", 28:"Pop", 29:"Pop Punk", 130:"Post punk", 31:"Powerpop", 32:"Progressive", 62:"Progrsv House", 33:"Psychedelic", 43:"Psychobilly", 34:"Punk", 35:"R&B", 36:"Rap", 37:"Reggae", 111:"Religious", 38:"Rock", 44:"Rockabilly", 94:"Roots Music", 115:"Salsa", 116:"Samba", 39:"Screamo", 78:"Shoegaze", 96:"Showtunes", 40:"Ska", 41:"Soul", 106:"Soundtracks / Film music", 42:"Southern Rock", 122:"Spanish pop", 48:"Surf", 114:"Swing", 108:"Tango", 53:"Techno", 54:"Thrash", 52:"Trance", 132:"Trance", 55:"Trip Hop", 92:"Tropical", 99:"Visual", 117:"Zouk"} + def setRDFStoreURL(url): '''set the rdf uri path''' rdfStoreURL = url This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |