[Assorted-commits] SF.net SVN: assorted:[1596] movie-lookup/trunk
Brought to you by:
yangzhang
From: <yan...@us...> - 2010-04-01 18:18:24
|
Revision: 1596 http://assorted.svn.sourceforge.net/assorted/?rev=1596&view=rev Author: yangzhang Date: 2010-04-01 18:18:17 +0000 (Thu, 01 Apr 2010) Log Message: ----------- added old python source from the old sharing-gateway project Added Paths: ----------- movie-lookup/trunk/oldsrc/ movie-lookup/trunk/oldsrc/format-movie-info.py movie-lookup/trunk/oldsrc/get-movie-info.py Added: movie-lookup/trunk/oldsrc/format-movie-info.py =================================================================== --- movie-lookup/trunk/oldsrc/format-movie-info.py (rev 0) +++ movie-lookup/trunk/oldsrc/format-movie-info.py 2010-04-01 18:18:17 UTC (rev 1596) @@ -0,0 +1,14 @@ +#!/usr/bin/env python + +import cPickle as pickle +import re + +f = file( 'output.txt' ) +items = pickle.load( f ) +f.close() + +print len( items ) +for item in items[:10]: + print item['orig_name'] +for item in items[-10:]: + print item['orig_name'] Property changes on: movie-lookup/trunk/oldsrc/format-movie-info.py ___________________________________________________________________ Added: svn:executable + * Added: movie-lookup/trunk/oldsrc/get-movie-info.py =================================================================== --- movie-lookup/trunk/oldsrc/get-movie-info.py (rev 0) +++ movie-lookup/trunk/oldsrc/get-movie-info.py 2010-04-01 18:18:17 UTC (rev 1596) @@ -0,0 +1,209 @@ +#!/usr/bin/env python + +import cPickle as pickle +import HTMLParser +import re +import sys +import urllib +import urllib2 +import twisted.python.text as text + +def urlopen( *args ): + while True: + try: + page = urllib2.urlopen( *args ) + return page.geturl(), page.readlines() + except KeyboardInterrupt: + raise + except: + sys.stderr.write( 'connection error, retrying...\n' ) + +class SearchParser( HTMLParser.HTMLParser ): + + def __init__( self, query ): + HTMLParser.HTMLParser.__init__( self ) + self.query = query + self.title = None + self.is_match = False + + def handle_starttag( self, tag, attrs ): + attrs = dict( attrs ) + if tag == 'a' and 'class' in attrs and ( + attrs['class'] == 'table-hl-header-link' or + attrs['class'] == 'movie-link' ): + self.next_url = attrs['href'] + + def handle_data( self, data ): + self.title = data + if self.query == simplify( data ): + self.is_match = True + +class ReviewsParser( HTMLParser.HTMLParser ): + + def __init__( self ): + HTMLParser.HTMLParser.__init__( self ) + self.do_extract = False + + def handle_starttag( self, tag, attrs ): + if tag == 'span': + attrs = dict( attrs ) + if 'class' in attrs and attrs[ 'class' ] == 'movie-body-text-bold': + self.do_extract = True + + def handle_data( self, data ): + if self.do_extract: + self.rating = data + self.do_extract = False + +class AboutParser( HTMLParser.HTMLParser ): + + def __init__( self ): + HTMLParser.HTMLParser.__init__( self ) + self.do_extract = False + self.summary = None + self.span_level = 0 + self.start_span_level = None + + def handle_starttag( self, tag, attrs ): + if tag == 'span': + attrs = dict( attrs ) + if 'class' in attrs and attrs[ 'class' ] == 'movie-body-text': + assert self.start_span_level == None + self.do_extract = True + self.start_span_level = self.span_level + self.span_level += 1 + + def handle_endtag( self, tag ): + if tag == 'span': + self.span_level -= 1 + if self.span_level == self.start_span_level: + self.do_extract = False + + def handle_data( self, data ): + if self.do_extract: + if self.summary is None: + self.summary = data + else: + self.summary += '\n' + data + +def simplify( text ): + def clean( text ): + text = re.sub( r'\s+', ' ', text ) + text = re.sub( r'^\s|\s$', '', text ) + return text + text = text.strip().lower() + text = re.sub( r'^(a|an|the)\b', '', text ) + text = re.sub( r'\(\d{4}\)$', '', text ) + text = clean( text ) + text = re.sub( r', (a|an|the)$', '', text ) + text = re.sub( r'[-,\.:]', ' ', text ) + text = re.sub( r'\(.*\)$', '', text ) + text = clean( text ) + return text + +def lookup( name ): + global appfile + item = {} + item[ 'orig_name' ] = name + item[ 'simp_name' ] = simplify( name ) + name = simplify( name ) + if re.search( r"[^0-9a-z' ]", name ) is not None: + print '\t', "ERROR PARSING NAME" + item[ 'parse_error' ] = True + else: + url, lines = urlopen( 'http://www.rottentomatoes.com/search/movie.php?%s' % + ( urllib.urlencode( { 'searchby': 'movies', 'search': name } ), ) ) + matches = [] + has_match = False + base_url = None + item[ 'search_url '] = url + item[ 'search_lines '] = lines + for line in lines: + if '<h1>' in line: + has_match = True + base_url = url + break + if 'table-hl-header-link' in line or 'movie-link' in line: + line = line[ line.find( '<a' ) : line.find( '</a>' ) + 4 ] + parser = SearchParser( name ) + parser.feed( line ) + parser.close() + + if parser.title is not None: + matches.append( ( parser.title, parser.next_url ) ) + if not has_match and parser.is_match: + has_match = True + item[ 'best_match' ] = (parser.title, parser.next_url) + next_url = parser.next_url + + item[ 'has_match' ] = has_match + item[ 'matches' ] = matches + if not has_match: + print '\t', 'CANNOT RESOLVE MATCHES' + for match in matches: + print '\t', match + else: + if base_url is None: + base_url = 'http://www.rottentomatoes.com' + next_url + reviews_url = base_url + url, lines = urlopen( reviews_url ) + else: + reviews_url = base_url + print '\t', base_url + item[ 'base_url' ] = base_url + + item[ 'reviews_url' ] = reviews_url + item[ 'reviews_lines' ] = lines + for line in lines: + if 'movie-body-text-bold' in line: + reviews_parser = ReviewsParser() + reviews_parser.feed( line ) + reviews_parser.close() + item[ 'rating' ] = reviews_parser.rating + break + + about_url = base_url + 'about.php' + url, lines = urlopen( about_url ) + item[ 'about_url' ] = about_url + item[ 'about_lines' ] = lines + start_line = None + for i, line in enumerate( lines ): + if 'movie-body-text' in line: + start_line = i + if start_line is not None and '</span>' in line: + rest = ''.join( lines[ start_line : i + 1 ] ) + about_parser = AboutParser() + about_parser.feed( rest ) + about_parser.close() + item[ 'summary' ] = about_parser.summary + break + + pickle.dump( item, appfile ) + + return item + +def main( argv = sys.argv ): + global appfile + items = [] + min = 0 + if len( argv ) > 1: + min = int( argv[1] ) + + f = file( 'movies.txt' ) + names = f.readlines() # [ simplify(line) for line in f.xreadlines() ] + f.close() + appfile = file( 'append.txt', 'w' ) + for i, name in enumerate( names ): + if i >= min: # and i == 31: + print i, ':', name + item = lookup( name ) + items.append( item ) + print + appfile.close() + + outfile = file( 'output.txt', 'w' ) + pickle.dump( items, outfile ) + outfile.close() + +if __name__ == '__main__': + sys.exit( main() ) Property changes on: movie-lookup/trunk/oldsrc/get-movie-info.py ___________________________________________________________________ Added: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |