[ca488f]: plugin.video.4players / resources / lib / scraper.py

import urllib2
import re
from BeautifulSoup import BeautifulSoup
from urllib import urlencode

IPAD_USERAGENT = (u'Mozilla/5.0 (iPad; U; CPU OS 3_2 like '
                  u'Mac OS X; en-us) AppleWebKit/531.21.10 (K'
                  u'HTML, like Gecko) Version/4.0.4 Mobile/7B'
                  u'367 Safari/531.21.10')
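# The iPad user agent presumably makes 4players.de serve plain HTML5
# <video> tags instead of its Flash player; getVideoFile() below relies
# on that markup.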

CATEGORIES = ('Alle', 'TopViews', 'TopRated', 'PC-CDROM',
              'PlayStation2', 'PlayStation3', 'Wii', '360', 'NDS',
              'PSP', 'Video-Fazit')

URL_PREFIX = 'http://www.4players.de/4players.php/tvplayer/'


def getVideos(filter=None, page=1):
    if filter not in CATEGORIES:
        filter = CATEGORIES[0]
    post = {'currentpage': str(int(page) - 1),
            'singlefilter': filter,
            'funcname': 'aktuellevideos',
            'numcols': 5,
            'numshown': 50,
            'refreshskims': 1}
    url = 'http://www.4players.de/paginatecontent.php'
    html = __getAjaxContent(url, post)
    tree = BeautifulSoup(html)
    # last_page_num: compare as integers, not strings, so that e.g. '10'
    # ranks above '9'; fall back to 1 if there are no pagination links
    page_links = tree.findAll('a', {'class': 'pagenavi'})
    page_nums = [int(link.contents[0]) for link in page_links
                 if link.contents[0].isdigit()]
    last_page_num = max(page_nums) if page_nums else 1
    # videos
    section = tree.find('div', {'class': re.compile('tv-weitere-container')})
    video_frames = section.findAll('li')
    videos = list()
    for frame in video_frames:
        link = frame.find('a', {'class': re.compile('tv-weiter-link')})
        # title
        title = link['title']
        # url: keep only the part getVideoFile() needs to rebuild the page URL
        video_page = link['href']
        url = video_page.replace(URL_PREFIX, '').replace('.html', '')
        # rating: the star rating digit sits seven characters from the end
        # of the div's class attribute
        rating_div = frame.find('div', {'class':
                                        re.compile('^tv-weitere-rating')})
        rating_char = rating_div['class'][-7:-6]
        if rating_char.isdigit():
            rating = int(rating_char)
        else:
            rating = 0
        # views
        views_div = frame.find('div', {'class':
                                       re.compile('^tv-weitere-views')})
        r = r'Views: (?P<views>[0-9]+)'
        m = re.search(r, unicode(views_div))
        if m:
            views = int(m.groupdict()['views'])
        else:
            views = 0
        # image: try to guess the thumb from the skim image URL
        skim_div = frame.find('div', {'class': 'skim'})
        if skim_div:
            image = skim_div['data-skimimageurl'].replace('skimimage',
                                                          'thumb160x90')
        else:
            image = ''
        # date
        date_div = frame.find('div', {'class':
                                      re.compile('^tv-weitere-datum')})
        r = r'(?P<day>[0-9]+)\.(?P<month>[0-9]+)\.(?P<year>20[0-9]+)'
        m = re.search(r, unicode(date_div))
        if m:
            date_dict = m.groupdict()
            date = '%s.%s.%s' % (date_dict['day'],
                                 date_dict['month'],
                                 date_dict['year'])
        else:
            date = ''
        # length
        len_div = frame.find('div', {'class':
                                     re.compile('^tv-weitere-laufzeit')})
        r = r'(?P<min>[0-9]+):(?P<sec>[0-9]+) (Min\.|min|MIn\.)'
        m = re.search(r, unicode(len_div))
        if m:
            length_dict = m.groupdict()
            length = '%s:%s' % (length_dict['min'], length_dict['sec'])
        else:
            length = '0:00'
        # finalize
        videos.append({'title': title,
                       'image': image,
                       'url': url,
                       'rating': rating,
                       'views': views,
                       'date': date,
                       'length': length})
    return videos, last_page_num
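
# Usage sketch (assumes network access to 4players.de; the category name is
# one of the CATEGORIES entries above):
#
#   videos, last_page = getVideos(filter='PC-CDROM', page=1)
#   for video in videos:
#       print '%(title)s (%(length)s, %(views)d views)' % video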


def __getAjaxContent(url, data_dict=None):
    if data_dict:
        post_data = urlencode(data_dict)
    else:
        # a non-empty body forces urllib2 to issue a POST request
        post_data = ' '
    req = urllib2.Request(url, post_data)
    # mimic an XMLHttpRequest from the site's own JavaScript
    req.add_header('User-Agent', IPAD_USERAGENT)
    req.add_header('Accept', 'text/javascript, */*')
    req.add_header('Content-Type',
                   'application/x-www-form-urlencoded; charset=UTF-8')
    req.add_header('X-Requested-With', 'XMLHttpRequest')
    response = urllib2.urlopen(req).read()
    return response
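
# Usage sketch for this module-internal helper (values are hypothetical):
#
#   html = __getAjaxContent('http://www.4players.de/paginatecontent.php',
#                           {'funcname': 'aktuellevideos'})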


def getVideoFile(page_url):
    video_page = URL_PREFIX + page_url + '.html'
    html = __getAjaxContent(video_page)
    # thanks to the iPad user agent the page embeds an HTML5 <video> tag,
    # so the stream URL can be pulled straight from the markup
    r = r'<video src="(?P<url>[^"]+)"'
    m = re.search(r, html)
    if m:
        return m.groupdict()['url']
    return None
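
# Usage sketch ('url' values returned by getVideos() already have URL_PREFIX
# and the '.html' suffix stripped; the path below is a hypothetical example):
#
#   stream_url = getVideoFile('some/video-page-path')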


def getCategories():
    return CATEGORIES
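

# Minimal smoke test, not part of the original module: exercises the scraper
# end to end when this file is run directly. Assumes network access and the
# Python 2 / BeautifulSoup 3 environment required by the imports above.
if __name__ == '__main__':
    all_videos, last_page = getVideos()
    print 'found %d videos, last page: %d' % (len(all_videos), last_page)
    if all_videos:
        print getVideoFile(all_videos[0]['url'])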