? new
? new2
? new3
? new3s
? new4
? new4s
? new5
? new5s
? new6
? new6s
? new7
? out
? patch
? test
? tv_grab_huro.cache
Index: tv_grab_huro
===================================================================
RCS file: /cvsroot/xmltv/xmltv/grab/huro/tv_grab_huro,v
retrieving revision 1.6
diff -u -5 -r1.6 tv_grab_huro
--- tv_grab_huro	19 Sep 2004 14:16:01 -0000	1.6
+++ tv_grab_huro	29 Nov 2004 19:37:21 -0000
@@ -322,11 +322,11 @@
 }
 
 # Make list of pages to fetch for each day.
 my @days;
 my $day=UnixDate($now,'%Q');
-for (my $i=1+$opt_offset;$i<$opt_days+$opt_offset+1;$i++) {
+for (my $i=1+$opt_offset;$i<$opt_days+$opt_offset+1;$i+=3) {
     push @days, [ $day, $i ];
     $day=nextday($day); die if not defined $day;
 }
 
 # This progress bar is for both downloading and parsing.  Maybe
@@ -393,11 +393,89 @@
     };
 
     # parse the page to a document object
     my $tree = HTML::TreeBuilder->new();
     $tree->parse($data);
-    my @program_data = get_program_data($tree);
+
+    my @datatables; 
+    # The page consists of two major tables, split by an advertisement.
+    # We reorder the table pieces so each column is read top to bottom.
+    # 
+    # The pieces are assigned to @datatables slots like this:
+
+    # UPPER MAJOR TABLE:
+    #  0      10    20
+    #  1      11    21
+    #         12
+    #
+    #  <<< the ad >>>
+    # LOWER MAJOR TABLE:
+    #  5      15    25
+    #         16
+    #
+    
+    my $i = 0;
+    my $lasttime = 0;
+    # $i is the next slot: each new column adds 10, the lower table starts at 5.
+    foreach my $tab($tree->look_down
+		    # "width"=>215 unfortunately isn't specified all the time
+		    ("_tag"=>"table", "cellspacing"=>2)) {
+	my $width = $tab->attr(qw(width));
+	next unless($width eq "100%" || $width == 215);
+
+	# times are printed in <strong> tags; require one so that the heading
+	# tables, which we don't really care about, are skipped.
+	next unless($tab->look_down("_tag"=>"strong"));
+
+
+	# Especially on port.ro a day column is not just two tables; it is
+	# split into even more pieces by images, etc.
+	#
+	# (It would be simpler if the site continued the table and put the
+	# image into a <tr><td> <img> </td></tr> row instead.)
+	#
+	# So every table piece has to be placed by the times it contains.
+	#
+
+
+	# extract the first time specified in this table-piece ...
+	$tab->as_text() =~ m/([012][0-9]):([0-5][0-9])/
+	    or die "unable to parse returned html page";
+	my $time = $1 * 60 + $2;
+	$time += 24 * 60 if($time < 6 * 60 && ($i % 10 > 4));
+
+	#print "this: $time, last: $lasttime ...\n";
+
+	if($time < $lasttime) {
+	    # this table is in the same major table, but in the next column,
+	    # since its first time is before the last time of the previous table.
+	    $i = $i - ($i % 5) + 10;
+	}
+
+	if($time > 19 * 60 && ($i % 10 < 5)) {
+	    # first time is after 19:00 => lower major table
+	    $i = 5;
+	}
+
+
+	# look up the last time in this minor table (baseline for the next comparison)
+	$tab->as_text() =~ m/.*([012][0-9]):([0-5][0-9])/
+	    or die "unable to parse returned html page";
+	$lasttime = $1 * 60 + $2;
+	$lasttime += 24 * 60 if($lasttime < 6 * 60 && ($i % 10 > 4));
+
+	
+	#print "assigning datatables entry ", $i, ".\n"; 
+	#$tab->dump();
+	$datatables[$i++] = $tab;
+    }
+
+    my @program_data;
+    foreach(@datatables) {
+	push @program_data, get_program_data($_)
+	    if defined;
+    }
 
     if (not @program_data) {
 	warn "no programs found, skipping\n";
 	return ();
     }
@@ -586,9 +664,9 @@
 
 # Bump a YYYYMMDD date by one.
 sub nextday( $ ) {
     my $d = shift;
     my $p = parse_date($d);
-    my $n = DateCalc($p, '+ 1 day');
+    my $n = DateCalc($p, '+ 3 day');
     return UnixDate($n, '%Q');
 }
 
