From: Ed A. <ep...@us...> - 2005-01-23 21:39:37
|
Update of /cvsroot/xmltv/xmltv/grab/pt In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12195/grab/pt Modified Files: tv_grab_pt Log Message: Bruno's new tv_grab_pt fetching from a different website. This is based on the version he sent me, plus the diff I cc'd to the mailing list, then a few changes to put back --gui and for detecting old config files. Index: tv_grab_pt =================================================================== RCS file: /cvsroot/xmltv/xmltv/grab/pt/tv_grab_pt,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** tv_grab_pt 7 Sep 2004 18:41:40 -0000 1.8 --- tv_grab_pt 23 Jan 2005 21:39:27 -0000 1.9 *************** *** 13,18 **** tv_grab_pt [--config-file FILE] --configure [--gui OPTION] ! tv_grab_pt [--config-file FILE] [--output FILE] [--days N] ! [--offset N] [--quiet] tv_grab_pt --list-channels --- 13,17 ---- tv_grab_pt [--config-file FILE] --configure [--gui OPTION] ! tv_grab_pt [--config-file FILE] [--output FILE] [--quiet] tv_grab_pt --list-channels *************** *** 41,49 **** B<--output FILE> write to FILE rather than standard output. - B<--days N> grab N days. The default is 3. - - B<--offset N> start N days in the future. The default is to start - from today. - B<--quiet> suppress the progress messages normally written to standard error. --- 40,43 ---- *************** *** 75,78 **** --- 69,74 ---- use HTML::Entities; # parse entities use IO::File; + use File::Path; + use File::Basename; use XMLTV; *************** *** 88,93 **** $0: get Portuguese television listings in XMLTV format To configure: $0 --configure [--config-file FILE] [--gui OPTION] ! To grab listings: $0 [--config-file FILE] [--output FILE] [--days N] ! [--offset N] [--quiet] To list channels: $0 --list-channels END --- 84,88 ---- $0: get Portuguese television listings in XMLTV format To configure: $0 --configure [--config-file FILE] [--gui OPTION] ! To grab listings: $0 [--config-file FILE] [--output FILE] [--quiet] To list channels: $0 --list-channels END *************** *** 95,100 **** # Attributes of the root element in output. ! my $HEAD = { 'source-info-url' => 'http://tv.clix.pt', ! 'source-data-url' => "http://tv.clix.pt", 'generator-info-name' => 'XMLTV', 'generator-info-url' => 'http://membled.com/work/apps/xmltv/', --- 90,95 ---- # Attributes of the root element in output. ! my $HEAD = { 'source-info-url' => 'http://tvcabo.pt', ! 'source-data-url' => "http://tvcabo.pt/tv_cabo_progr_canal.asp", 'generator-info-name' => 'XMLTV', 'generator-info-url' => 'http://membled.com/work/apps/xmltv/', *************** *** 112,139 **** # Get options, including undocumented --cache option. XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); ! my ($opt_days, $opt_offset, $opt_help, $opt_output, $opt_configure, $opt_config_file, $opt_gui, $opt_quiet, $opt_list_channels); - $opt_days = 2; # default - $opt_offset = 0; # default $opt_quiet = 0; # default ! GetOptions('days=i' => \$opt_days, ! 'offset=i' => \$opt_offset, ! 'help' => \$opt_help, 'configure' => \$opt_configure, 'config-file=s' => \$opt_config_file, ! 'gui:s' => \$opt_gui, 'output=s' => \$opt_output, 'quiet' => \$opt_quiet, ! 'list-channels' => \$opt_list_channels ) or usage(0); - die 'number of days must not be negative' - if (defined $opt_days && $opt_days < 0); - warn "site normally has only two days of listings\n" - if $opt_days + $opt_offset > 2; usage(1) if $opt_help; XMLTV::Ask::init($opt_gui); my $mode = XMLTV::Mode::mode('grab', # default $opt_configure => 'configure', --- 107,137 ---- # Get options, including undocumented --cache option. XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); ! my ($opt_help, $opt_output, $opt_configure, $opt_config_file, $opt_gui, $opt_quiet, $opt_list_channels); $opt_quiet = 0; # default ! GetOptions('help' => \$opt_help, 'configure' => \$opt_configure, 'config-file=s' => \$opt_config_file, ! 'gui:s' => \$opt_gui, 'output=s' => \$opt_output, 'quiet' => \$opt_quiet, ! 'list-channels' => \$opt_list_channels, ! 'offset=i' => \ my $opt_offset, # ignored ! 'days=i' => \ my $opt_days, # ignored ) or usage(0); usage(1) if $opt_help; XMLTV::Ask::init($opt_gui); + # --offset and --days are ignored (we return more data than was + # requested) but at least check the user didn't ask for something + # impossible. + # + my $first_day = ($opt_offset || 0) + ($opt_days || 0); + die 'cannot grab more than one week ahead' if $first_day >= 7; + warn "ignoring --offset\n" if defined $opt_offset; + warn "ignoring --days\n" if defined $opt_days; + my $mode = XMLTV::Mode::mode('grab', # default $opt_configure => 'configure', *************** *** 148,151 **** --- 146,150 ---- if ($mode eq 'configure') { XMLTV::Config_file::check_no_overwrite($config_file); + mkpath(dirname($config_file)); } elsif ($mode eq 'grab') { *************** *** 181,186 **** print CONF '#' if not $w; my $name = shift @names; ! print CONF "channel $_\n"; ! # TODO don't store display-name in config file. } --- 180,184 ---- print CONF '#' if not $w; my $name = shift @names; ! print CONF "channel $_.tvcabo.pt\n"; } *************** *** 205,212 **** } $w_args{encoding} = 'ISO-8859-1'; ! my $writer = new XMLTV::Writer(%w_args); ! $writer->start($HEAD); if ($mode eq 'list-channels') { $writer->write_channel($_) foreach @ch_all; $writer->end(); --- 203,211 ---- } $w_args{encoding} = 'ISO-8859-1'; ! my $writer; ! sub start_writing() { ($writer = new XMLTV::Writer(%w_args))->start($HEAD) } if ($mode eq 'list-channels') { + start_writing; $writer->write_channel($_) foreach @ch_all; $writer->end(); *************** *** 223,230 **** ++ $line_num; next if not defined; ! if (/^channel:?\s+(\S+)(s+)?/) { my $ch_did = $1; push @channels, $ch_did; } else { warn "$config_file:$line_num: bad line\n"; --- 222,242 ---- ++ $line_num; next if not defined; ! ! # For now, check that tvcabo.pt appears on every line. This ! # ensures we don't have a config file left over from the old ! # grabber. ! # ! if (/^channel:?\s+(\d+).tvcabo.pt\s*$/) { my $ch_did = $1; + die if not defined $ch_did; push @channels, $ch_did; } + elsif (/^channel/) { + die <<END + The configuration file is left over from the old tv_grab_pt. The new + site uses different channels so you need to reconfigure the grabber. + END + ; + } else { warn "$config_file:$line_num: bad line\n"; *************** *** 235,240 **** # begin main program # Assume the listings source uses CET (see BUGS above). - my $now = DateCalc(ParseDate('now'), "$opt_offset days"); die "No channels specified, run me with --configure\n" if not keys %channels; --- 247,253 ---- # begin main program + start_writing; + # Assume the listings source uses CET (see BUGS above). die "No channels specified, run me with --configure\n" if not keys %channels; *************** *** 245,276 **** # they could be separate. # my $bar = new XMLTV::ProgressBar('getting listings', ! scalar(@channels) * $opt_days) if not $opt_quiet; foreach my $ch_did (@channels) { my $ch_name=$channels{$ch_did}; ! $writer->write_channel({ id => $ch_did, 'display-name' => [ [ $ch_name ] ] }); } ! my $date = UnixDate($now,'%Q'); ! for (my $i = 0; $i < $opt_days; $i++) { ! my $some = 0; ! foreach my $ch_did (@channels) { ! foreach (process_table($ch_did, $date)) { $writer->write_programme($_); $some = 1; } update $bar if $bar; - } - if (not $some) { - die "no programmes found\n" if $i == 0; - warn "only one day of listings found\n" if $i == 1; - warn "only $i days of listings found\n" if $i > 1; - last; - } - $date = nextday($date); die if not defined $date; } ! $bar->finish() if $bar; $writer->end(); --- 258,285 ---- # they could be separate. # + my $bar = new XMLTV::ProgressBar('getting listings', ! scalar @channels) if not $opt_quiet; foreach my $ch_did (@channels) { + die if not defined $ch_did; my $ch_name=$channels{$ch_did}; ! $writer->write_channel({ id => $ch_did.'.tvcabo.pt', 'display-name' => [ [ $ch_name ] ] }); } ! ! my $some=0; ! foreach my $ch_did (@channels) { ! foreach (process_table($ch_did)) { $writer->write_programme($_); $some = 1; } update $bar if $bar; } ! if (not $some) { ! die "no programmes found\n" unless $some; ! } ! $writer->end(); *************** *** 293,297 **** # Clean up bad characters in HTML. - my $warned_bad_chars; sub tidy( $ ) { for (my $s = shift) { --- 302,305 ---- *************** *** 300,306 **** # s/\s\226\s/ vs /g; - if (tr/\012\015\040-\176\240-\377//dc) { - warn 'removing bad characters' unless $warned_bad_chars++; - } return $_; } --- 308,311 ---- *************** *** 308,318 **** sub process_table { ! my ($ch_xmltv_id, $date) = @_; ! t "Getting channel $ch_xmltv_id, date $date\n"; ! die unless $date =~ /(\d{4})(\d{2})(\d{2})/; ! my $my_date = "$1-$2-$3"; ! my $url = $HEAD->{'source-info-url'}."/canais.html?dia=$my_date&channel=$channels{$ch_xmltv_id}"; #print STDERR "Getting url : $url"; t $url; --- 313,321 ---- sub process_table { ! my ($ch_xmltv_id) = @_; ! t "Getting channel $ch_xmltv_id\n"; ! my $url = $HEAD->{'source-data-url'}."?identificadorCanal=$ch_xmltv_id&l1.x=1&l1.y=2"; #print STDERR "Getting url : $url"; t $url; *************** *** 337,341 **** my @r; foreach my $p (@program_data) { ! push @r, make_programme_hash($ch_xmltv_id, $p, $first, $date); } return @r; --- 340,344 ---- my @r; foreach my $p (@program_data) { ! push @r, make_programme_hash($ch_xmltv_id, $p, $first); } return @r; *************** *** 343,357 **** sub make_programme_hash { ! my ($ch_xmltv_id, $cur, $first, $date) = @_; my %prog; ! $prog{channel}=$ch_xmltv_id; ! $prog{title}=[ [ $cur->{title}, $LANG ] ]; ! $prog{"sub-title"}=[ [ $cur->{subtitle}, $LANG ] ] if $cur->{subtitle}; ! $prog{category}=[ [ $cur->{category}, $LANG ] ] if $cur->{category}; ! if ( $cur->{time} < $first->{time} ) { t "Jumping for next day of (".$cur->{time}.",".$first->{time}.") $date..."; $date = nextday($date); --- 346,363 ---- sub make_programme_hash { ! my ($ch_xmltv_id, $cur, $first) = @_; ! ! my $date = $cur->{date}; my %prog; ! $prog{channel} =$ch_xmltv_id.'.tvcabo.pt'; ! $prog{title} =[ [ $cur->{title}, $LANG ] ]; ! $prog{"sub-title"} =[ [ $cur->{subtitle}, $LANG ] ] if $cur->{subtitle}; ! $prog{category} =[ [ $cur->{category}, $LANG ] ] if $cur->{category}; ! $prog{"episode-num"} = $cur->{"episode-num"} if $cur->{"episode-num"}; ! if ( ($cur->{time} < $first->{time}) && ($cur->{date} == $first->{date}) ) { t "Jumping for next day of (".$cur->{time}.",".$first->{time}.") $date..."; $date = nextday($date); *************** *** 376,379 **** --- 382,395 ---- + # as_trimmed_text() doesn't deal with ASCII 160, non-breaking space. + sub trim( $ ) { + for (my $tmp = shift) { + tr/\240/ /; + s/^\s+//; + s/\s+$//; + return $_; + } + } + # sub get_program_data { *************** *** 382,419 **** my @data; my @tables = $tree->find_by_tag_name("_tag"=>"table"); # Actually time and title are required, but we don't check that. foreach my $table (@tables) { - my @trs = $table->find_by_tag_name("_tag"=>"tr"); next unless $trs[2]; ! my $tr = $trs[3]; ! my @tds = $tr->find_by_tag_name("_tag"=>"td"); ! next unless (scalar(@tds) >= 2); ! my $should_be_hour = $tds[0]->as_trimmed_text."\n"; ! #print STDERR "*".$should_be_hour."*\n"; ! next unless ($should_be_hour =~ /^Hora/); ! #print STDERR "Found the leading html\n"; ! my $index = 4; while ($trs[$index]) { my @tds = $trs[$index]->find_by_tag_name("_tag"=>"td"); ! my $time = $tds[0]->as_trimmed_text; ! my $title = $tds[1]->as_trimmed_text; ! my $cat = $tds[2]->as_trimmed_text; - #print STDERR "Found $time | $title | $cat\n"; $time =~ s/://g; my %h = ( time => $time, - category=> $cat, title=> $title, ! subtitle=> "", desc => ""); push @data, \%h; $index = $index + 1; } - last; } return @data; --- 398,481 ---- my @data; + my %month_conv = ( + 'Jan' => 'Jan', + 'Fev' => 'Feb', + 'Mar' => 'Mar', + 'Abr' => 'Apr', + 'Mai' => 'May', + 'Jun' => 'Jun', + 'Jul' => 'Jul', + 'Ago' => 'Aug', + 'Set' => 'Sep', + 'Out' => 'Oct', + 'Nov' => 'Nov', + 'Dez' => 'Dec', + ); + my @tables = $tree->find_by_tag_name("_tag"=>"table"); + # Actually time and title are required, but we don't check that. + my $table_n = 0; foreach my $table (@tables) { my @trs = $table->find_by_tag_name("_tag"=>"tr"); next unless $trs[2]; ! my @tds = $trs[2]->find_by_tag_name("_tag"=>"td"); ! next unless (scalar(@tds) == 1); ! my $should_be_hour = trim($tds[0]->as_trimmed_text())."\n"; ! next unless ($should_be_hour =~ /.* - (\d+) de (.*) de (\d+)/); ! #print STDERR "\n*".$should_be_hour."*\n"; ! my ($day, $month, $year) = ($1, $2, $3); ! $month =~ s/^(\w{3}).*$/$1/; ! $month = $month_conv{$month}; ! ! #print STDERR "Parsed date to $day, $month, $year -> ".ParseDate("$day $month $year")."\n"; ! my $parsed_date = ParseDate(sprintf("%02d %3s %04d",$day, $month, $year)); ! die "Parse error, could not parse date" unless $parsed_date; ! my $f_date = $parsed_date; ! $f_date =~ s/^(\d{4})(\d{2})(\d{2}).*/$1$2$3/; ! #print "Final date found : $f_date\n"; ! #print "--- $f_date matched\n"; ! my $index = 3; while ($trs[$index]) { my @tds = $trs[$index]->find_by_tag_name("_tag"=>"td"); ! last unless( $tds[0] && $tds[1] ); ! my $time = trim($tds[0]->as_trimmed_text()); ! my $title = trim($tds[1]->as_trimmed_text()); ! my $sub_t = ""; ! my $episode = ""; ! ! if ( my $i = $tds[1]->find_by_tag_name("_tag"=>"i") ) { ! my $tmp = trim($i->as_trimmed_text()); ! $tmp =~ s/\(/\\\(/g; ! $tmp =~ s/\)/\\\)/g; ! $title =~ s/$tmp//; ! $tmp =~ s/^\s*?(.*?)\s*?/$1/; ! if ($tmp =~ /(Epis..io\s+\d+)/) { ! my $ep = $1; ! $tmp =~ s/$ep//; ! $ep =~ s/^Epis..io\s+//; ! $episode = ""; ! ! } ! if ($tmp) { ! $sub_t = $tmp; ! } ! } $time =~ s/://g; + #print STDERR "Found $time | $title\n"; + for ($title) { s/^\s+$//; s/\s+$// } my %h = ( time => $time, title=> $title, ! date => $f_date, ! "episode-num" => $episode, ! subtitle=> $sub_t, desc => ""); push @data, \%h; $index = $index + 1; } } return @data; *************** *** 425,429 **** if not $opt_quiet; my %channels; ! my $url=$HEAD->{'source-info-url'}; t $url; my $local_data=get_nice($url); --- 487,491 ---- if not $opt_quiet; my %channels; ! my $url=$HEAD->{'source-data-url'}; t $url; my $local_data=get_nice($url); *************** *** 434,451 **** $tree->parse($local_data); my @menus = $tree->find_by_tag_name("_tag"=>"select"); ! foreach my $elem (@menus) { my $cname = $elem->attr('name'); ! next unless $cname eq 'Channel'; my @ocanals = $elem->find_by_tag_name("_tag"=>"option"); @ocanals = sort @ocanals; foreach my $opt (@ocanals) { ! my $channel_id = $opt->content->[0]; ! $channel_id =~ s/\s*$//; ! $channel_id =~ s/\s/\_/g; ! my $channel_name= $opt->content->[0]; ! next if ($channel_id =~ /^[\-]+$/); ! next if ($channel_id =~ /^Canal$/); ! for ($channel_name) { s/^\s+//; s/\s+$// } $channels{$channel_id}=$channel_name; push @ch_all, { 'display-name' => [ [ $channel_name, --- 496,508 ---- $tree->parse($local_data); my @menus = $tree->find_by_tag_name("_tag"=>"select"); ! foreach my $elem (@menus) { my $cname = $elem->attr('name'); ! next unless $cname eq 'identificadorCanal'; my @ocanals = $elem->find_by_tag_name("_tag"=>"option"); @ocanals = sort @ocanals; foreach my $opt (@ocanals) { ! my $channel_id = $opt->attr('value'); ! my $channel_name= trim($opt->content->[0]); $channels{$channel_id}=$channel_name; push @ch_all, { 'display-name' => [ [ $channel_name, *************** *** 454,466 **** } #foreach } #while ! ! if (not %channels) { ! if ($local_data =~ /(P.gina tempor.riamente indisponivel)/) { ! die "$url says $1, cannot grab\n"; ! } ! die "no channels could be found in $url\n"; ! } update $bar if not $opt_quiet; - $bar->finish() if not $opt_quiet; return %channels; } --- 511,516 ---- } #foreach } #while ! die "no channels could be found" if not keys %channels; update $bar if not $opt_quiet; return %channels; } *************** *** 472,475 **** return UnixDate($n, '%Q'); } - - --- 522,523 ---- |