[6f2280]: plugin.video.disclose_tv / resources / lib / scraper.py  Maximize  Restore  History

Download this file

101 lines (87 with data), 3.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Tristan Fischer (sphere@dersphere.de)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from urllib2 import HTTPError, urlopen

import simplejson
from BeautifulSoup import BeautifulSoup
# Base URL of the disclose.tv site; all scraped paths below are joined
# onto it (note the trailing slash).
MAIN_URL = 'http://www.disclose.tv/'
class NetworkError(Exception):
    """Raised when fetching a remote URL fails with an HTTP error."""
    pass


class Scraper:
    """Scrapes video topics, listings and stream URLs from disclose.tv."""

    def get_video_topics(self):
        """Return a list of {'title', 'id'} dicts, one per video topic."""
        log('get_video_topics started')
        path = 'action/videolist/page/1/all/filter/'
        url = MAIN_URL + path
        tree = self.__get_tree(url)
        ul = tree.find('ul', {'id': 'videos-media-box-filter'})
        topics = []
        for li in ul.findAll('li'):
            topics.append({
                'title': li.a.string,
                # The topic id is the 6th path segment of the link href.
                'id': li.a['href'].split('/')[5]
            })
        return topics

    def get_videos(self, topic_id, page):
        """Return a list of video dicts for the given topic and page.

        Each dict has the keys: id, thumbnail, path, title, duration.
        """
        log('get_videos_by_topic_id started with topic_id=%s' % topic_id)
        url = MAIN_URL + 'action/videolist/page/%d/%s/filter/' % (
            int(page), topic_id
        )
        tree = self.__get_tree(url)
        div = tree.find('div', {'id': 'videos-media-box-list'})
        videos = []
        for li in div.findAll('li'):
            a = li.find('a')
            title = a['title']
            video_id, path = a['href'].split('/')[3:5]
            span_content = li.find('span', {'class': 'types typeV'}).contents
            # The duration span appears in two markup variants; anything
            # else yields an empty duration rather than crashing.
            if len(span_content) == 1:
                duration = span_content[0].split(' ')[1]
            elif len(span_content) == 2:
                duration = span_content[1]
            else:
                duration = ''
            videos.append({
                'id': video_id,
                'thumbnail': self.__img(li.find('img')['src']),
                'path': path,
                'title': title,
                'duration': duration.strip()
            })
        return videos

    def get_video_url(self, video_id):
        """Return the playable stream URL for the given video id."""
        # MAIN_URL already ends with '/' — no leading slash here, consistent
        # with the other URL builders above (avoids a '//' in the request).
        url = MAIN_URL + 'videos/config/video/%s.js' % video_id
        json = self.__get_json(url)
        return json['clip']['url']

    @staticmethod
    def __img(url):
        """Turn a 135x76 thumbnail URL into its full-size variant,
        dropping any query string."""
        return url.replace('135x76', '').split('?')[0]

    def __get_json(self, url):
        """Fetch url and parse the response body as JSON."""
        html = self.__get_url(url)
        return simplejson.loads(html)

    def __get_tree(self, url):
        """Fetch url and parse the response into a BeautifulSoup tree,
        converting HTML entities to their unicode characters."""
        html = self.__get_url(url)
        return BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    def __get_url(self, url):
        """Fetch url and return the raw response body.

        Raises NetworkError if the server answers with an HTTP error.
        """
        log('__get_url opening url: %s' % url)
        try:
            html = urlopen(url).read()
        except HTTPError as error:
            log('__urlopen HTTPError: %s' % error)
            raise NetworkError('HTTPError: %s' % error)
        log('__get_url got %d bytes' % len(html))
        return html
def log(text):
    """Print a debug message prefixed with 'Scraper: '."""
    # Parenthesized single-argument form: a plain print statement under
    # Python 2, a print() call under Python 3 — identical output either way.
    print(u'Scraper: %s' % text)

Get latest updates about Open Source Projects, Conferences and News.

Sign up for the SourceForge newsletter:





No, thanks