From: Stefan B. <ste...@us...> - 2014-06-14 18:18:39
|
Update of /cvsroot/xmltv/xmltv/grab/fi/fi/source In directory sfp-cvs-1.v30.ch3.sourceforge.com:/tmp/cvs-serv31409/grab/fi/fi/source Modified Files: tvhs.pm yle.pm Log Message: - yle: updated after site changed completely - NOTE: yle has new channel IDs! - test.conf: updated with latest full channel list - test.sh: reduced full run from 9 to 7 days Index: yle.pm =================================================================== RCS file: /cvsroot/xmltv/xmltv/grab/fi/fi/source/yle.pm,v retrieving revision 2.02 retrieving revision 2.03 diff -C2 -d -r2.02 -r2.03 *** yle.pm 1 Nov 2013 22:55:13 -0000 2.02 --- yle.pm 14 Jun 2014 18:18:36 -0000 2.03 *************** *** 13,16 **** --- 13,17 ---- use strict; use warnings; + use Date::Manip::Date; BEGIN { *************** *** 25,68 **** sub description { 'yle.fi' } - # yle.fi offers program guides in multiple languages - # language URL attribute - # | XMLTV language code - # | | - my %languages = ( - fi => "fi", - se => "sv", - ); - # Grab channel list sub channels { my %channels; ! # For each language ! while (my($language, $code) = each %languages) { # Fetch & parse HTML ! my $root = fetchTree("http://ohjelmaopas.yle.fi/?lang=$language"); if ($root) { # ! # Channel list can be found from this dropdown: # ! # <select name="week" id="viikko_dropdown" class="dropdown"> ! # <option value="">Valitse kanava</option> ! # <option value="tv1">YLE TV1</option> ! # ... ! # <option value="tvf">TV Finland (CET)</option> ! # </select> # ! if (my $container = $root->look_down("id" => "viikko_dropdown")) { ! if (my @options = $container->find("option")) { ! debug(2, "Source ${language}.yle.fi found " . scalar(@options) . " channels"); ! foreach my $option (@options) { ! my $id = $option->attr("value"); ! my $name = $option->as_text(); if (defined($id) && length($id) && length($name)) { debug(3, "channel '$name' ($id)"); ! $channels{"${id}.${language}.yle.fi"} = "$code $name"; } } --- 26,59 ---- sub description { 'yle.fi' } # Grab channel list sub channels { my %channels; ! # yle.fi offers program guides in multiple languages ! foreach my $code ("fi", "sv") { # Fetch & parse HTML ! my $root = fetchTree("http://ohjelmaopas.yle.fi/tv/lang/$code?previousUrl=/tv/opas"); if ($root) { # ! # Channel list can be found from this list: # ! # <ul class="channel-lists ..."> ! # <li><h1 id="yle-tv1">Yle TV1...</h1>...</li> ! # <li><h1 id="yle-tv2">Yle TV2...</h1>...</li> ! # ... ! # </ul> # ! if (my $container = $root->look_down("class" => qr/^channel-lists\s+/)) { ! if (my @headers = $container->find("h1")) { ! debug(2, "Source ${code}.yle.fi found " . scalar(@headers) . " channels"); ! foreach my $header (@headers) { ! my $id = $header->attr("id"); ! my $name = $header->as_text(); if (defined($id) && length($id) && length($name)) { debug(3, "channel '$name' ($id)"); ! $channels{"${id}.${code}.yle.fi"} = "$code $name"; } } *************** *** 82,121 **** } - # Category map - my $category_map; - - # Parse categories and compile map - # - # <div id="aihe_list"> - # <a href="#" style='color:#29a8db;'" class="aihe_linkki" id="aihe_linkki_0">Kaikki</a> - # <a href="#" " class="aihe_linkki" id="aihe_linkki_1">Uutiset</a> - # <a href="#" " class="aihe_linkki" id="aihe_linkki_2">Ajankohtais</a> - # ... - # <a href="#" " class="aihe_linkki" id="aihe_linkki_10">Viihde ja musiikki</a> - # </div> - sub _parseCategories($$) { - my($root, $language) = @_; - if (my $container = $root->look_down("id" => "aihe_list")) { - if (my @hrefs = $container->find("a")) { - debug(2, "Source ${language}.yle.fi found " . scalar(@hrefs) . " categories"); - foreach my $href (@hrefs) { - my $id = $href->attr("id"); - my $name = $href->as_text(); - - # Ignore category 0 (kaikki) - my $category; - if (defined($id) && - (($category) = ($id =~ /^aihe_linkki_(\d+)$/)) && - $category && - length($name)) { - debug(3, "category $language '$name' ($category)"); - $category_map->{$language}->{$category} = $name; - } - } - } - } - return($category_map->{$language}); - } - # Grab one day sub grab { --- 73,76 ---- *************** *** 123,217 **** # Get channel number from XMLTV id ! return unless my($channel, $language) = ($id =~ /^([^.]+)\.([^.]+)\.yle\.fi$/); ! ! # Select language ! return unless exists $languages{$language}; ! my $code = $languages{$language}; ! # Fetch & parse HTML ! my $root = fetchTree("http://ohjelmaopas.yle.fi/?lang=$language&groups=$channel&d=$today"); if ($root) { ! my $map = $category_map->{$language}; ! ! # Only parse category list once ! $map = _parseCategories($root, $language) unless defined $map; # ! # Each programme can be found in a separate <div> node ! # ! # The class is a combination of ! # programme - literal ! # clear - encryption? ! # (onair) - this programme is currently on the air ! # catN - category type? ! # ! # <div class="programme clear onair cat1" style=""> ! # <div class="start">18.00</div> ! # <div class="title"> ! # <a href="?show=tv1201012151800" class="programmelink" id="link_tv11800">Kuuden Tv-uutiset ja sää</a> ! # </div><br /> ! # <div class="desc" id="desc_tv11800"> ! # <span class="desc_title">Kuuden Tv-uutiset ja sää</span> ! # <span class="desc_time"> ! # YLE TV1 18.00 - ! # 18.30 ! # </span> ! # Mukana talous kulttuuri ja urheilu.<br /> ! # <a ...</a> ! # </div> ! # </div> # ! # - first entry always starts on $today ! # - last entry always ends on $tomorrow ! # - the end time in "desc_time" is unfortunately unreliable and leads to ! # overlapping programme entries. We only use it for the last entry. # ! my $opaque = startProgrammeList($id, $code); ! if (my @programmes = $root->look_down("class" => qr/^programme\s+/)) { ! my($last_hour, $last_minute); ! ! foreach my $programme (@programmes) { ! my $start = $programme->look_down("class", "start"); ! my $title = $programme->look_down("class", "programmelink"); ! my $desc = $programme->look_down("class", "desc"); ! my $time = $programme->look_down("class", "desc_time"); ! if ($start && $title && $desc && $time) { ! $start = join("", $start->content_list()); ! $title = join("", $title->content_list()); ! $time = join("", $time->content_list()); ! # Extract text elements from desc (why is this so complicated?) ! $desc = join("", grep { not ref($_) } $desc->content_list()); ! $desc =~ s/^\s+//; ! $desc =~ s/\s+$//; ! # Sanity checks ! if ((my($hour, $minute) = ($start =~ /^(\d{2})\.(\d{2})/)) && ! (($last_hour, $last_minute) = ! ($time =~ /\d{2}\.\d{2}\s+-\s+(\d{2})\.(\d{2})/)) && ! length($title)) { ! my($category) = $programme->attr("class") =~ /cat(\d+)/ ! if defined $map; ! $category = $map->{$category} ! if defined $category; ! debug(3, "List entry $channel ($hour:$minute) $title"); ! debug(4, $category) if defined $category; ! debug(4, $desc); ! # Add programme ! my $object = appendProgramme($opaque, $hour, $minute, $title); ! $object->category($category); ! $object->description($desc); } } } - - # Add dummy entry to define stop time for last entry - # Check for special case "24:00" - appendProgramme($opaque, $last_hour == 24 ? 0 : $last_hour, - $last_minute, "DUMMY") - if defined $last_hour; } --- 78,158 ---- # Get channel number from XMLTV id ! return unless my($channel, $code) = ($id =~ /^([^.]+)\.([^.]+)\.yle\.fi$/); ! # Fetch & parse HTML (do not ignore HTML5 <time>) ! my $root = fetchTree("http://ohjelmaopas.yle.fi/tv/lang/$code?previousUrl=/tv/opas&t=" . $today->ymdd(), ! undef, undef, 1); if ($root) { ! my @objects; ! my $date = Date::Manip::Date->new(); # ! # Each programme can be found in a separate <li> node # ! # <ul class="channel-lists ..."> ! # <li> ! # <h1 id="yle-tv1">Yle TV1...</h1> ! # <ul> ! # <li class="program-entry ..."> ! # <div class="program-label"> ! # <time class="dtstart" datetime="2014-06-15T01:30:00.000+03:00">01:30</time> ! # <time class="dtend" datetime="2014-06-15T04:30:00.000+03:00"></time> ! # <div class="program-title"> ! # ... ! # <a class="link-grey" href="...">Suunnistuksen Jukolan viesti</a> ! # ... ! # </div> ! # </div> ! # ... ! # <div class="program-desc"> ! # <p>66. Jukolan viesti. Kolmas, neljäs ja viides osuus... ! # ... ! # </p> ! # </div> ! # </li> ! # ... ! # </ul> ! # ... ! # </li> ! # </ul> # ! if (my $container = $root->look_down("class" => qr/^channel-lists\s+/)) { ! if (my $header = $container->look_down("_tag" => "h1", ! "id" => $channel)) { ! if (my $parent = $header->parent()) { ! if (my @programmes = $parent->look_down("class" => qr/^program-entry\s+/)) { ! foreach my $programme (@programmes) { ! my $start = $programme->look_down("class", "dtstart"); ! my $end = $programme->look_down("class", "dtend"); ! my $title = $programme->look_down("class", "program-title"); ! my $desc = $programme->look_down("class", "program-desc"); ! if ($start && $end && $title && $desc) { ! $start = $date->parse($start->attr("datetime")) ? undef : $date->secs_since_1970_GMT(); ! $end = $date->parse($end->attr("datetime")) ? undef : $date->secs_since_1970_GMT(); ! $title = $title->as_text(); ! $desc = $desc->find("p"); ! $title =~ s/^\s+//; ! $title =~ s/\s+$//; ! if ($start && $end && length($title)) { ! $desc = $desc ? $desc->as_text() : ""; ! $desc =~ s/^\s+//; ! $desc =~ s/\s+$//; ! debug(3, "List entry $channel ($start -> $end) $title"); ! debug(4, $desc) if $desc; ! # Create program object ! my $object = fi::programme->new($id, $code, $title, $start, $end); ! $object->description($desc); ! push(@objects, $object); ! } ! } ! } } } } } *************** *** 219,225 **** $root->delete(); ! # Convert list to program objects ! # First entry always starts $today -> don't use $yesterday ! return(convertProgrammeList($opaque, undef, $today, $tomorrow)); } --- 160,167 ---- $root->delete(); ! # Fix overlapping programmes ! fi::programme->fixOverlaps(\@objects); ! ! return(\@objects); } Index: tvhs.pm =================================================================== RCS file: /cvsroot/xmltv/xmltv/grab/fi/fi/source/tvhs.pm,v retrieving revision 2.06 retrieving revision 2.07 diff -C2 -d -r2.06 -r2.07 *** tvhs.pm 1 Nov 2013 22:34:23 -0000 2.06 --- tvhs.pm 14 Jun 2014 18:18:36 -0000 2.07 *************** *** 137,143 **** # Fetch & parse HTML ! my $root = fetchTree("http://tv.hs.fi/home/grid?group=${group}&date=" . ! sprintf("%04d-%02d-%02d", ! $today->year(), $today->month(), $today->day())); if ($root) { my @objects; --- 137,141 ---- # Fetch & parse HTML ! my $root = fetchTree("http://tv.hs.fi/home/grid?group=${group}&date=" . $today->ymdd()); if ($root) { my @objects; |