[Assorted-commits] SF.net SVN: assorted:[1596] movie-lookup/trunk

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 1596
          http://assorted.svn.sourceforge.net/assorted/?rev=1596&view=rev
Author:   yangzhang
Date:     2010-04-01 18:18:17 +0000 (Thu, 01 Apr 2010)

Log Message:
-----------
added old python source from the old sharing-gateway project

Added Paths:
-----------
    movie-lookup/trunk/oldsrc/
    movie-lookup/trunk/oldsrc/format-movie-info.py
    movie-lookup/trunk/oldsrc/get-movie-info.py

Added: movie-lookup/trunk/oldsrc/format-movie-info.py
===================================================================

--- movie-lookup/trunk/oldsrc/format-movie-info.py	                        (rev 0)
+++ movie-lookup/trunk/oldsrc/format-movie-info.py	2010-04-01 18:18:17 UTC (rev 1596)
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+
+import cPickle as pickle
+import re
+
+f = file( 'output.txt' )  # Python 2 built-in file(); with one argument it opens the path for reading
+items = pickle.load( f )  # NOTE: unpickling can execute arbitrary code; only run this on a trusted output.txt
+f.close()
+
+print len( items )  # number of unpickled entries
+for item in items[:10]:  # first ten entries
+	print item['orig_name']  # each entry is indexed by the key 'orig_name', so entries are expected to be dict-like
+for item in items[-10:]:  # last ten entries (overlaps the first ten when there are fewer than 20)
+	print item['orig_name']


Property changes on: movie-lookup/trunk/oldsrc/format-movie-info.py
___________________________________________________________________
Added: svn:executable
   + *

Added: movie-lookup/trunk/oldsrc/get-movie-info.py
===================================================================
--- movie-lookup/trunk/oldsrc/get-movie-info.py	                        (rev 0)
+++ movie-lookup/trunk/oldsrc/get-movie-info.py	2010-04-01 18:18:17 UTC (rev 1596)
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+
+import cPickle as pickle
+import HTMLParser
+import re
+import sys
+import urllib
+import urllib2
+import twisted.python.text as text
+
+def urlopen( *args ):  # Fetch via urllib2.urlopen( *args ); returns ( url reported by geturl(), list of lines ). Retries until a fetch succeeds.
+	while True:  # no retry limit and no delay between attempts
+		try:
+			page = urllib2.urlopen( *args )
+			return page.geturl(), page.readlines()  # geturl() is the URL actually retrieved, which may differ from the request after a redirect
+		except KeyboardInterrupt:
+			raise  # let Ctrl-C escape the retry loop
+		except:  # NOTE(review): bare except retries on every other exception, not only on network failures
+			sys.stderr.write( 'connection error, retrying...\n' )
+
+class SearchParser( HTMLParser.HTMLParser ):  # Parses an HTML snippet; records the href of a result link, the last text chunk seen, and whether any text matches the query.
+
+	def __init__( self, query ):  # query: string compared against simplify( text ) in handle_data
+		HTMLParser.HTMLParser.__init__( self )
+		self.query = query
+		self.title = None  # last text chunk passed to handle_data; stays None if no text is fed
+		self.is_match = False  # becomes True once some text chunk simplifies to exactly self.query
+
+	def handle_starttag( self, tag, attrs ):
+		attrs = dict( attrs )  # HTMLParser supplies attrs as a list of ( name, value ) pairs
+		if tag == 'a' and 'class' in attrs and (
+				attrs['class'] == 'table-hl-header-link' or
+				attrs['class'] == 'movie-link' ):
+					self.next_url = attrs['href']  # NOTE(review): next_url is not initialised in __init__, and this raises KeyError if the <a> has no href
+
+	def handle_data( self, data ):
+		self.title = data  # overwritten on every text chunk, so only the last one is kept
+		if self.query == simplify( data ):  # simplify() is the module-level function defined further down
+			self.is_match = True
+
+class ReviewsParser( HTMLParser.HTMLParser ):  # Stores in self.rating the first text chunk that follows a <span class="movie-body-text-bold"> start tag.
+
+	def __init__( self ):
+		HTMLParser.HTMLParser.__init__( self )
+		self.do_extract = False  # True between a matching <span> start tag and the next text chunk
+
+	def handle_starttag( self, tag, attrs ):
+		if tag == 'span':
+			attrs = dict( attrs )  # HTMLParser supplies attrs as a list of ( name, value ) pairs
+			if 'class' in attrs and attrs[ 'class' ] == 'movie-body-text-bold':
+				self.do_extract = True
+
+	def handle_data( self, data ):
+		if self.do_extract:
+			self.rating = data  # NOTE(review): rating is only ever assigned here, so the attribute is missing if no matching span/text was fed
+			self.do_extract = False  # take a single text chunk per matching span
+
+class AboutParser( HTMLParser.HTMLParser ):  # Gathers all text inside a <span class="movie-body-text"> (nested spans included) into self.summary, joined by newlines.
+
+	def __init__( self ):
+		HTMLParser.HTMLParser.__init__( self )
+		self.do_extract = False  # True while inside the target span
+		self.summary = None  # accumulated text; stays None if nothing is extracted
+		self.span_level = 0  # current <span> nesting depth
+		self.start_span_level = None  # nesting depth at which the target span was opened
+
+	def handle_starttag( self, tag, attrs ):
+		if tag == 'span':
+			attrs = dict( attrs )  # HTMLParser supplies attrs as a list of ( name, value ) pairs
+			if 'class' in attrs and attrs[ 'class' ] == 'movie-body-text':
+				assert self.start_span_level == None  # start_span_level is never reset, so a second matching span in the same feed trips this assert
+				self.do_extract = True
+				self.start_span_level = self.span_level
+			self.span_level += 1  # every <span> counts, matching or not
+
+	def handle_endtag( self, tag ):
+		if tag == 'span':
+			self.span_level -= 1
+			if self.span_level == self.start_span_level:  # back at the depth where the target span opened, i.e. it has just closed
+				self.do_extract = False
+
+	def handle_data( self, data ):
+		if self.do_extract:
+			if self.summary is None:
+				self.summary = data  # first chunk starts the summary
+			else:
+				self.summary += '\n' + data  # later chunks are appended on their own line
+
+def simplify( text ):  # Normalise a title for comparison: lower-case, drop a leading or trailing article and trailing "(...)", turn - , . : into spaces, collapse whitespace.
+	def clean( text ):  # collapse each whitespace run to one space, then remove one whitespace character from each end
+		text = re.sub( r'\s+', ' ', text )
+		text = re.sub( r'^\s|\s$', '', text )
+		return text
+	text = text.strip().lower()  # NOTE: inside this function the parameter shadows the module-level "text" import
+	text = re.sub( r'^(a|an|the)\b', '', text )  # leading article
+	text = re.sub( r'\(\d{4}\)$', '', text )  # trailing parenthesised four-digit number such as "(1999)"
+	text = clean( text )
+	text = re.sub( r', (a|an|the)$', '', text )  # trailing article written as ", the"
+	text = re.sub( r'[-,\.:]', ' ', text )  # listed punctuation becomes a space
+	text = re.sub( r'\(.*\)$', '', text )  # any remaining trailing parenthesised text; greedy, so it starts at the first "("
+	text = clean( text )
+	return text
+
+def lookup( name ):  # Look up one movie title on rottentomatoes.com; returns a dict of everything gathered and also pickles that dict to appfile.
+	global appfile  # open file object assigned by main() before lookup() is called
+	item = {}
+	item[ 'orig_name' ] = name
+	item[ 'simp_name' ] = simplify( name )
+	name = simplify( name )  # same value as item[ 'simp_name' ]; from here on name is the simplified form
+	if re.search( r"[^0-9a-z' ]", name ) is not None:  # anything other than digits, a-z, apostrophe or space: skip the network lookup
+		print '\t', "ERROR PARSING NAME"
+		item[ 'parse_error' ] = True
+	else:
+		url, lines = urlopen( 'http://www.rottentomatoes.com/search/movie.php?%s' %  # search request; url is whatever URL the response reports
+				( urllib.urlencode( { 'searchby': 'movies', 'search': name } ), ) )
+		matches = []  # ( title, href ) for every result link parsed
+		has_match = False
+		base_url = None  # stays None unless the search response itself is taken as the movie page
+		item[ 'search_url '] = url  # NOTE(review): the key has a trailing space inside the quotes; looks like a typo, confirm before reading it back
+		item[ 'search_lines '] = lines  # NOTE(review): same trailing space in this key
+		for line in lines:
+			if '<h1>' in line:  # presumably means the search landed directly on a movie page rather than a result list; TODO confirm
+				has_match = True
+				base_url = url
+				break
+			if 'table-hl-header-link' in line or 'movie-link' in line:
+				line = line[ line.find( '<a' ) : line.find( '</a>' ) + 4 ]  # keep the text from the first '<a' through the first '</a>'; assumes both occur on this line
+				parser = SearchParser( name )
+				parser.feed( line )
+				parser.close()
+
+				if parser.title is not None:
+					matches.append( ( parser.title, parser.next_url ) )  # NOTE(review): next_url exists only if the parsed <a> carried one of the two classes; AttributeError otherwise
+					if not has_match and parser.is_match:  # the first link whose text simplifies to the query wins
+						has_match = True
+						item[ 'best_match' ] = (parser.title, parser.next_url)
+						next_url = parser.next_url
+
+		item[ 'has_match' ] = has_match
+		item[ 'matches' ] = matches
+		if not has_match:
+			print '\t', 'CANNOT RESOLVE MATCHES'
+			for match in matches:  # list the candidates that were found
+				print '\t', match
+		else:
+			if base_url is None:  # the match came from the result list, so the movie page still has to be fetched
+				base_url = 'http://www.rottentomatoes.com' + next_url  # presumably next_url is a site-relative path; TODO confirm
+				reviews_url = base_url
+				url, lines = urlopen( reviews_url )
+			else:
+				reviews_url = base_url  # lines still holds the search response, which is reused below
+			print '\t', base_url
+			item[ 'base_url' ] = base_url
+
+			item[ 'reviews_url' ] = reviews_url
+			item[ 'reviews_lines' ] = lines
+			for line in lines:
+				if 'movie-body-text-bold' in line:  # only the first such line is parsed
+					reviews_parser = ReviewsParser()
+					reviews_parser.feed( line )
+					reviews_parser.close()
+					item[ 'rating' ] = reviews_parser.rating  # NOTE(review): AttributeError if the parser never saw a matching span followed by text
+					break
+
+			about_url = base_url + 'about.php'  # assumes base_url ends with '/'; TODO confirm
+			url, lines = urlopen( about_url )
+			item[ 'about_url' ] = about_url
+			item[ 'about_lines' ] = lines
+			start_line = None
+			for i, line in enumerate( lines ):
+				if 'movie-body-text' in line:  # substring test, so lines containing 'movie-body-text-bold' also match
+					start_line = i  # moves forward on every matching line until a '</span>' line ends the search
+				if start_line is not None and '</span>' in line:
+					rest = ''.join( lines[ start_line : i + 1 ] )  # the candidate lines up to and including the one with '</span>'
+					about_parser = AboutParser()
+					about_parser.feed( rest )
+					about_parser.close()
+					item[ 'summary' ] = about_parser.summary  # None if the parser extracted no text
+					break
+
+	pickle.dump( item, appfile )  # runs for both branches above, so every processed name is written to appfile as soon as it is done
+
+	return item
+
+def main( argv = sys.argv ):  # Read titles from movies.txt, look each one up, and pickle the list of results to output.txt. The default argv is bound when the def is evaluated.
+	global appfile  # shared with lookup(), which pickles each item to it
+	items = []
+	min = 0  # index of the first line to process; NOTE: shadows the builtin min inside this function
+	if len( argv ) > 1:
+		min = int( argv[1] )  # optional first argument overrides the starting index
+
+	f = file( 'movies.txt' )
+	names = f.readlines() # [ simplify(line) for line in f.xreadlines() ]
+	f.close()
+	appfile = file( 'append.txt', 'w' )  # mode 'w' truncates any existing append.txt
+	for i, name in enumerate( names ):
+		if i >= min: # and i == 31:
+			print i, ':', name  # name comes from readlines(), so it still carries its line ending
+			item = lookup( name )
+			items.append( item )
+			print
+	appfile.close()
+
+	outfile = file( 'output.txt', 'w' )
+	pickle.dump( items, outfile )  # the whole list in one pickle, unlike append.txt which gets one pickle per item
+	outfile.close()
+
+if __name__ == '__main__':  # run only when executed as a script, not when imported
+	sys.exit( main() )  # main() has no return statement, so this passes None to sys.exit


Property changes on: movie-lookup/trunk/oldsrc/get-movie-info.py
___________________________________________________________________
Added: svn:executable
   + *


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.