Diff of /plugin.video.disclose_tv/resources/lib/scraper.py [000000] .. [6f2280]  Maximize  Restore

Switch to side-by-side view

--- a
+++ b/plugin.video.disclose_tv/resources/lib/scraper.py
@@ -0,0 +1,100 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+#     Copyright (C) 2012 Tristan Fischer (sphere@dersphere.de)
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import simplejson
+from BeautifulSoup import BeautifulSoup
+from urllib2 import urlopen
+
+MAIN_URL = 'http://www.disclose.tv/'
+
+
+class NetworkError(Exception):
+    """Raised by Scraper when an HTTP request to the site fails."""
+
+
+class Scraper:
+    """Scrapes video topics, video lists and video URLs from MAIN_URL."""
+
+    def get_video_topics(self):
+        """Return the video topics offered by the site.
+
+        Fetches the first "all" video list page and reads every <li> inside
+        the <ul id="videos-media-box-filter"> element.
+
+        Returns a list of dicts with the keys 'title' (text of the link)
+        and 'id' (sixth '/'-separated component of the link's href).
+        Raises NetworkError if the page request fails with an HTTPError.
+        """
+        log('get_video_topics started')
+        path = 'action/videolist/page/1/all/filter/'
+        url = MAIN_URL + path
+        tree = self.__get_tree(url)
+        ul = tree.find('ul', {'id': 'videos-media-box-filter'})
+        return [
+            {'title': li.a.string, 'id': li.a['href'].split('/')[5]}
+            for li in ul.findAll('li')
+        ]
+
+    def get_videos(self, topic_id, page):
+        """Return the videos listed on one page of a topic.
+
+        topic_id is inserted into the list URL as-is; page is converted
+        with int() before being formatted into the URL.
+
+        Returns a list of dicts with the keys 'id', 'thumbnail', 'path',
+        'title' and 'duration', one per <li> inside the
+        <div id="videos-media-box-list"> element.
+        Raises NetworkError if the page request fails with an HTTPError.
+        """
+        log('get_videos started with topic_id=%s' % topic_id)
+        url = MAIN_URL + 'action/videolist/page/%d/%s/filter/' % (
+            int(page), topic_id
+        )
+        tree = self.__get_tree(url)
+        div = tree.find('div', {'id': 'videos-media-box-list'})
+        videos = []
+        for li in div.findAll('li'):
+            a = li.find('a')
+            title = a['title']
+            # The fourth and fifth href components are the id and the path.
+            video_id, path = a['href'].split('/')[3:5]
+            span_content = li.find('span', {'class': 'types typeV'}).contents
+            if len(span_content) == 1:
+                # Single child: the duration is its second word.
+                duration = span_content[0].split(' ')[1]
+            elif len(span_content) == 2:
+                # Two children: the duration is the second child.
+                duration = span_content[1]
+            else:
+                duration = ''
+            videos.append({
+                'id': video_id,
+                'thumbnail': self.__img(li.find('img')['src']),
+                'path': path,
+                'title': title,
+                'duration': duration.strip()
+            })
+        return videos
+
+    def get_video_url(self, video_id):
+        """Return the 'url' entry of the 'clip' object in a video's config.
+
+        Raises NetworkError if the config request fails with an HTTPError.
+        """
+        # MAIN_URL already ends with '/', so the path must not start with one.
+        url = MAIN_URL + 'videos/config/video/%s.js' % video_id
+        config = self.__get_json(url)
+        return config['clip']['url']
+
+    @staticmethod
+    def __img(url):
+        """Return url without the '135x76' substring and without a query.
+
+        NOTE(review): '135x76' is presumably the thumbnail size - confirm.
+        """
+        return url.replace('135x76', '').split('?')[0]
+
+    def __get_json(self, url):
+        """Download url and return its body parsed by simplejson."""
+        html = self.__get_url(url)
+        return simplejson.loads(html)
+
+    def __get_tree(self, url):
+        """Download url and return it as a BeautifulSoup tree.
+
+        HTML entities are converted while parsing.
+        """
+        html = self.__get_url(url)
+        return BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
+
+    def __get_url(self, url):
+        """Download url and return the raw response body.
+
+        Raises NetworkError if the request fails with an HTTPError.
+        """
+        # Imported locally because the module header only imports urlopen;
+        # without it the except clause below would raise NameError.
+        from urllib2 import HTTPError
+        log('__get_url opening url: %s' % url)
+        try:
+            response = urlopen(url)
+            try:
+                html = response.read()
+            finally:
+                # Release the connection even if reading fails.
+                response.close()
+        except HTTPError, error:
+            log('__get_url HTTPError: %s' % error)
+            raise NetworkError('HTTPError: %s' % error)
+        log('__get_url got %d bytes' % len(html))
+        return html
+
+
+def log(text):
+    """Print text to stdout with a u'Scraper: ' prefix."""
+    message = u'Scraper: %s' % text
+    print message

Get latest updates about Open Source Projects, Conferences and News.

Sign up for the SourceForge newsletter:





No, thanks