Learn how easy it is to sync an existing GitHub or Google Code repo to a SourceForge project! See Demo

Close

[1830f2]: plugin.video.conectate / resources / lib / videos_scraper.py Maximize Restore History

Download this file

videos_scraper.py    196 lines (174 with data), 9.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Andres Trapanotto (andres.trapanotto@gmail.com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Note: Some media titles had the <em> </em> html included. This is
# stripped by a hardcoded regex.
# ToDo:
# * To strip title html tags with BeautifulSoup
#
import datetime, re
from urllib import urlencode
from urllib2 import urlopen, Request
from BeautifulSoup import BeautifulSoup
USER_AGENT = (
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 '
'(KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7'
)
REFERER = 'http://www.conectate.gob.ar/'
VIDEO_LANDING_URL = (
'http://www.conectate.gob.ar/'
)
class Scraper(object):
def __init__(self, force_refresh=False):
self.force_refresh = force_refresh
self.requests = {}
# Get show names from site index
def get_show_names(self, menuargc, page):
log('get_show_names started for %s' % menuargc)
# url = 'http://www.conectate.gob.ar/educar-portal-video-web/module/busqueda/busquedaAvanzada.do?modulo=menu&temaCanalId=1&canalId=1&tipoEmisionId=3&pagina=1'
url = 'http://www.conectate.gob.ar/educar-portal-video-web/module/busqueda/busquedaAvanzada.do?modulo=menu&%s&pagina=%s' % (menuargc, page)
html = self.__get_url(url)
programas_ids = re.compile('<a href=".+?(idRecurso=\d+?)"><img src=".+?" alt=".+?" class=".+?" /></a>').findall(html)
programas_titles = re.compile('<a href=".+?idRecurso=\d+?"><img src=".+?" alt="(.+?)" class=".+?" /></a>').findall(html)
programas_icons = re.compile('<a href=".+?idRecurso=\d+?"><img src="(.+?)" alt=".+?" class=".+?" /></a>').findall(html)
# log('get_show_names len(programas_titles)= %d ' % len(programas_titles))
video_topics = []
for programas_id, programas_icon, programas_title in zip(programas_ids, programas_icons, programas_titles):
video_topics.append({
'id': programas_id,
'name': str(BeautifulSoup(programas_title,convertEntities=BeautifulSoup.HTML_ENTITIES)),
'icon': programas_icon
})
# log('get_show_names got %d topics' % len(video_topics))
return video_topics
def get_episodes_by_show_name(self, show_name):
log('get_episodes_by_show_name started')
url = 'http://www.conectate.gob.ar/educar-portal-video-web/module/detalleRecurso/DetalleRecurso.do?%s' % show_name
html = self.__get_url(url)
programas_ids = re.compile('<a href=".+?(idRecurso=\d+?)">.+?</a>').findall(html)
programas_titles = re.compile('<a href=".+?idRecurso=\d+?">(.+?)</a>').findall(html)
programas_icon = re.compile('<img src="(.+?image_id=.+?)" />').findall(html)
# programas_description = ''
# programas_description = re.compile('<p>\s*(.+?)\r\s*</p>').findall(html)
programas_description = re.compile('<p>(.+?)</p>').findall(html)[0]
videos = []
for programas_id, programas_title in zip(programas_ids, programas_titles):
videos.append({
'id': programas_id,
'title': str(BeautifulSoup(programas_title,convertEntities=BeautifulSoup.HTML_ENTITIES)),
'thumbnail': programas_icon[0],
'count': 1,
'duration': '',
'originaltitle': str(BeautifulSoup(programas_title,convertEntities=BeautifulSoup.HTML_ENTITIES)),
'description': programas_description,
'date': '',
'filesize': '',
'author': '',
'genres': ''
})
log('get_episodes_by_show_name got %d videos' % len(videos))
# print videos
return videos
def get_single_episodes(self, menuargc, page):
log('get_single_episodes started')
url = 'http://conectate.gob.ar/educar-portal-video-web/module/busqueda/busquedaAvanzada.do?%s&pagina=%s' % (menuargc, page)
html = self.__get_url(url)
programas_ids = re.compile('<a href=".+?(idRecurso=\d+?)"><img src=".+?" alt=".+?" .+?/></a>').findall(html)
programas_titles = re.compile('<a href=".+?idRecurso=\d+?"><img src=".+?" alt="(?:<em>)?(.+?)(?:</em>)?" .+?/></a>').findall(html)
programas_icons = re.compile('<a href=".+?idRecurso=\d+?"><img src="(.+?)" alt=".+?" .+?/></a>').findall(html)
# programas_descriptions = re.compile('<p>\s*(.+?)\r\s*</p>').findall(html)
programas_descriptions = re.compile('<p>(.+?)</p>').findall(html)
# log('get_show_names len(programas_ids)= %d ' % len(programas_ids))
# log('get_show_names len(programas_titles)= %d ' % len(programas_titles))
videos = []
for programas_id, programas_title, programas_icon, programas_description in zip(programas_ids, programas_titles, programas_icons, programas_descriptions):
videos.append({
'id': programas_id,
'title': str(BeautifulSoup(programas_title,convertEntities=BeautifulSoup.HTML_ENTITIES)),
'thumbnail': programas_icon,
'duration': '',
'originaltitle': str(BeautifulSoup(programas_title,convertEntities=BeautifulSoup.HTML_ENTITIES)),
'description': programas_description,
'date': '',
'filesize': '',
'author': '',
'genres': ''
})
log('get_single_episodes got %d videos' % len(videos))
# print videos
return videos
def get_video(self, id):
import xbmcaddon
log('get_video started with id=%s' % id)
addon = xbmcaddon.Addon(id='plugin.video.conectate')
#url = 'http://www.conectate.gob.ar/educar-portal-video-web/module/detalleRecurso/DetalleRecurso.do?canalId=1&modulo=menu&temaCanalId=1&tipoEmisionId=3&recursoPadreId=50027&idRecurso=50030'
url = 'http://www.conectate.gob.ar/educar-portal-video-web/module/detalleRecurso/DetalleRecurso.do?%s' % id
html = self.__get_url(url)
media_sd = re.compile('\'file\' \: \'(.+?)\'').findall(html)[0]
media_hd = re.compile('\'hd.file\' \: \'(.+?)\'').findall(html)[0]
hd_setting_value = addon.getSetting('hd_setting')
# log('get_video hd_setting_value=%r' % hd_setting_value )
# log('get_video media_hd=%s' % media_hd )
if (media_hd and ( hd_setting_value == 'true' )):
media_url = media_hd
else:
media_url = media_sd
media_sub_url = re.compile('\'captions.file\' \: \'(.+?)\'').findall(html)
if (len(media_sub_url) == 1):
media_sub=media_sub_url[0]
else:
media_sub = ''
media_title = re.compile('<title>(.+?).\|.+?</title>').findall(html)
# media_title = re.compile('<title>(?:<em>)?(.+?).(?:<em>)?.?\|.+?</title>').findall(html)
media_thumb = re.compile('<img src=".+?repositorio/Imagen/ver.+?" />').findall(html)
media_description = re.compile('<!p>\s*(.+?)\r\s*</p>').findall(html)
video = {
'title': str(BeautifulSoup(media_title[0],convertEntities=BeautifulSoup.HTML_ENTITIES)),
'thumbnail': media_thumb[0],
'url': media_url,
'url_sub': media_sub,
'description': str(BeautifulSoup(media_description[0],convertEntities=BeautifulSoup.HTML_ENTITIES))
}
log('get_video finished')
# print video
return video
def __get_url(self, url, get_dict='', post_dict=''):
log('__get_url started with url=%s, get_dict=%s, post_dict=%s'
% (url, get_dict, post_dict))
uid = '%s-%s-%s' % (url, urlencode(get_dict), urlencode(post_dict))
if uid in self.requests.keys():
log('__get_url using cache for url=%s' % url)
response = self.requests[uid]
else:
if get_dict:
full_url = '%s?%s' % (url, urlencode(get_dict))
else:
full_url = url
req = Request(full_url)
req.add_header('User-Agent', USER_AGENT)
req.add_header('Referer', REFERER)
log('__get_url opening url=%s' % full_url)
if post_dict:
response = urlopen(req, urlencode(post_dict)).read()
else:
response = urlopen(req).read()
self.requests[uid] = response
log('__get_url finished with %d bytes result' % len(response))
return response
def log(text):
print 'Conectate.gob.ar scraper: %s' % text