#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

=pod

=encoding utf8

=head1 NAME

tv_grab_is - Grab TV listings for Iceland.

=head1 SYNOPSIS

tv_grab_is --help

tv_grab_is [--config-file FILE] --configure [--gui OPTION]

tv_grab_is [--config-file FILE] [--output FILE] [--days N]
           [--offset N] [--quiet]

tv_grab_is --capabilities

tv_grab_is --version

=head1 DESCRIPTION

Output TV listings for several channels available in Iceland.

First run B<tv_grab_is --configure> to choose which channels you want to
download. Then running B<tv_grab_is> with no arguments will output
listings in XML format to standard output.

B<--configure> Prompt for which channels, and write the configuration file.

B<--config-file FILE> Set the name of the configuration file, the
default is B<~/.xmltv/tv_grab_is.conf>. This is the file written by
B<--configure> and read when grabbing.

B<--gui OPTION> Use this option to enable a graphical interface to be used.
OPTION may be 'Tk', or left blank for the best available choice.
Additional allowed values of OPTION are 'Term' for normal terminal output
(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar.

B<--output FILE> Write to FILE rather than standard output.

B<--days N> Grab N days. The default is 7.

B<--offset N> Start N days in the future. The default is to start
from today.

B<--quiet> Suppress the progress messages normally written to standard
error.

B<--capabilities> Show which capabilities the grabber supports. For more
information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities>

B<--version> Show the version of the grabber.

B<--help> Print a help message and exit.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Yngvi Þór Sigurjónsson (yngvi@teymi.is). Heavily based on tv_grab_dk by
Jesper Skov (jskov@zoftcorp.dk). tv_grab_dk originally based on tv_grab_nl
by Guido Diepen and Ed Avis (ed@membled.com) and tv_grab_fi by Matti Airas.

Version 1.1, Eggert Thorlacius (eggert@thorlacius.com).
Started out by replacing a couple of channels with XML feeds, but ended up
by removing the sjonvarp.is code completely. Left in the "xxx.sjonvarp.is"
XMLTV IDs for backwards compatibility.

Version 1.4 Hafþór Hilmarsson ( haffi@haffi.is).
Current code did not work since it was missing end tags for some elements.
Works now with Dispatcharr tested.

=head1 BUGS

=cut

use strict;
use XMLTV;
use XMLTV::Version "$XMLTV::VERSION";
use XMLTV::Capabilities qw/baseline manualconfig cache/;
use XMLTV::Description 'Iceland';
use Getopt::Long;
use HTML::TreeBuilder;
use HTML::Entities; # parse entities
use IO::File;
use URI;
use Encode qw(encode decode FB_CROAK);
use utf8; # source code is encoded in utf8
use Date::Manip;
use XML::LibXSLT;
use XML::DOM;
use XMLTV::Memoize;
use XMLTV::ProgressBar;
use XMLTV::Ask;
use XMLTV::Mode;
use XMLTV::Config_file;
use XMLTV::DST;
use XMLTV::Get_nice;
use XMLTV::Supplement qw/GetSupplement/;
use XMLTV::Date;

# NOTE(review): the statement below is reproduced token-for-token as found.
# Text between "use XMLTV::Usage <" and "= 6)" appears to have been lost
# before this copy was made (presumably the usage here-document and the
# opening of a BEGIN block that tests the Date::Manip version). Restore it
# from the upstream file; it cannot be reconstructed from what is visible.
use XMLTV::Usage <= 6) {
        Date::Manip::Date_Init("SetDate=now,UTC");
    } else {
        Date::Manip::Date_Init("TZ=UTC");
    }
}

# default language
my $LANG = 'is';

# Forward declarations so the prototypes are known before first use.
sub basechid($);
sub ispluschannel($);
sub process_xml_channel( $$$$$$ );
sub process_ruv_is( $$$$$$ );
sub get_categories_map();
sub normalize_to_text($);

# Get options
XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
my ($opt_days, $opt_offset, $opt_help, $opt_output,
    $opt_configure, $opt_config_file, $opt_gui,
    $opt_quiet, $opt_list_channels);
$opt_days   = 7; # default
$opt_offset = 0; # default
GetOptions('days=i'        => \$opt_days,
           'offset=i'      => \$opt_offset,
           'help'          => \$opt_help,
           'configure'     => \$opt_configure,
           'config-file=s' => \$opt_config_file,
           'gui:s'         => \$opt_gui,
           'output=s'      => \$opt_output,
           'quiet'         => \$opt_quiet,
           'list-channels' => \$opt_list_channels,
          )
  or usage(0);
die 'number of days must not be negative'
  if (defined $opt_days && $opt_days < 0);
usage(1) if $opt_help;

XMLTV::Ask::init($opt_gui);

# Exactly one of: grab (default), configure, list-channels.
my $mode = XMLTV::Mode::mode('grab', # default
                             $opt_configure     => 'configure',
                             $opt_list_channels => 'list-channels',
                            );

# File that stores which channels to download.
my $config_file
  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_is', $opt_quiet);

if ($mode eq 'configure') {
    XMLTV::Config_file::check_no_overwrite($config_file);

    # Three-argument open on a lexical handle: the file name can no longer
    # be interpreted as part of the open mode.
    open(my $conf_fh, '>', $config_file)
      or die "cannot write to $config_file: $!";

    # find list of available channels
    # ($bar stays undef under --quiet; "my ... if" is avoided because its
    # behaviour is undefined.)
    my $bar;
    $bar = XMLTV::ProgressBar->new('getting list of channels', 1)
      if not $opt_quiet;
    my %channels = get_channels();
    die 'no channels could be found' if (scalar(keys(%channels)) == 0);
    if (defined $bar) {
        $bar->update();
        $bar->finish();
    }

    my @chs   = sort keys %channels;
    my @names = map { $channels{$_} } @chs;
    my @qs    = map { "add channel $_?" } @names;
    my @want  = ask_many_boolean(1, @qs);
    foreach my $ch_did (@chs) {
        my $w = shift @want;
        if (not defined $w) {
            warn("cannot read input, stopping channel questions");
            last;
        }
        # No need to print to user - XMLTV::Ask is verbose enough.

        # Print a config line, but comment it out if channel not wanted.
        print {$conf_fh} '#' if not $w;
        my $name = shift @names;
        print {$conf_fh} "channel $ch_did $name\n";
        # TODO don't store display-name in config file.
    }

    close $conf_fh or warn "cannot close $config_file: $!";
    say("Finished configuration.");

    exit();
}

# Not configuring, we will need to write some output.
die if $mode ne 'grab' and $mode ne 'list-channels';

# If we are grabbing, check we can read the config file before doing
# anything else.
#
my @config_lines;
if ($mode eq 'grab') {
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}

my %w_args;
if (defined $opt_output) {
    # Explicit mode argument instead of gluing ">" onto the file name.
    my $fh = IO::File->new($opt_output, '>');
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}
$w_args{encoding} = 'UTF-8';
$w_args{UNSAFE} = 1; # Needed by process_xml_channel (it calls $writer->raw)
my $writer = XMLTV::Writer->new(%w_args);

# TODO: standardize these things between grabbers.
$writer->start
  ({ 'generator-info-name' => 'XMLTV',
     'generator-info-url'  => 'http://xmltv.org/',
   });

# --list-channels: write one <channel> element per known channel and exit.
if ($mode eq 'list-channels') {
    # $bar stays undef under --quiet; "my ... if" is avoided because its
    # behaviour is undefined.
    my $bar;
    $bar = XMLTV::ProgressBar->new('getting list of channels', 1)
      if not $opt_quiet;
    my %channels = get_channels();
    die 'no channels could be found' if (scalar(keys(%channels)) == 0);
    $bar->update() if defined $bar;

    foreach my $ch_did (sort(keys %channels)) {
        my $ch_name      = $channels{$ch_did};
        my $display_name = normalize_to_text($ch_name);
        my $ch_xid       = "$ch_did.sjonvarp.is";
        $writer->write_channel({
            id             => $ch_xid,
            'display-name' => [ [ $display_name ] ],
            'icon'         => [ { 'src' => get_icon($ch_did) } ],
        });
    }
    $bar->finish() if defined $bar;
    $writer->end();
    exit();
}

# Not configuring or writing channels, must be grabbing listings.
die if $mode ne 'grab';

# $ch_did and $ch_name are declared here because later file-level code
# uses them without its own declaration.
my (%channels, @channels, $ch_did, $ch_name);

# Position of the current entry in @config_lines, counted from 1, used in
# the "bad line" diagnostic. Entries may be undef (skipped below).
# NOTE(review): this assumes read_lines() keeps one entry per file line so
# that the position equals the file line number - confirm against
# XMLTV::Config_file.
my $line_num = 0;
foreach my $line (@config_lines) {
    ++$line_num;
    next if not defined $line;

    # FIXME channel data should be read from the site, and then the
    # config file only gives the XMLTV ids that are interesting.
    #
    # Using internal list of channels now to get correctly encoded
    # channel names. Config file format remains unchanged for compatibility
    #
    if ($line =~ /^channel:?\s+(\S+)\s+([^\#]+)/) {
        # Only the id is kept; the display name in the file is ignored.
        push @channels, $1;
    }
    else {
        # Report the counter maintained above. The previous code printed
        # $. (the line number of the last-read filehandle), which does not
        # track this loop.
        warn "$config_file:$line_num: bad line\n";
    }
}
%channels = get_channels();

######################################################################
# begin main program

# Fetch map of category number to names
my %categories;
get_categories_map();

my $now = parse_date('now');
die if not defined $now;

# Declare array of { date, num_days, channel_id, channel_xid, function }
# and populate it with all pages we need to get.
my @to_get;

# First day to grab: the date part of $now shifted by --offset days.
my $startday = UnixDate(DateCalc(UnixDate($now, '%Y-%m-%d'),
                                 "+ $opt_offset days"),
                        '%Y-%m-%d');
die if not defined $startday;

foreach my $ch_did (@channels) {
    my $ch_name      = $channels{$ch_did};
    my $display_name = normalize_to_text($ch_name);
    my $ch_xid       = "$ch_did.sjonvarp.is";

    $writer->write_channel({
        id             => $ch_xid,
        'display-name' => [ [ $display_name ] ],
        'icon'         => [ { 'src' => get_icon($ch_did) } ],
    });

    # Programme times of "plus." channels are shifted by one hour.
    my $timeoffset = ispluschannel($ch_did) ? 1 : 0;

    # RUV is the only supported source at the moment.
    push @to_get, [ $startday, $opt_days, $ch_did, $ch_xid,
                    $timeoffset, \&process_ruv_is ];
}

# $bar stays undef under --quiet ("my ... if" has undefined behaviour).
my $bar;
$bar = XMLTV::ProgressBar->new('fetching data', scalar @to_get)
  if not $opt_quiet;
foreach my $job (@to_get) {
    my ($date, $num_days, $ch_tvgids_id, $ch_xmltv_id, $timeoffset, $func)
      = @$job;
    $func->($writer, $date, $num_days, $ch_tvgids_id, $timeoffset,
            $ch_xmltv_id);
    $bar->update() if defined $bar;
}
$bar->finish() if defined $bar;
$writer->end();

######################################################################
# subroutine definitions

# Attempts to parse the director from a string ("Leikstjóri: NAME.").
# If successful, that sentence is removed from the argument (which is
# modified in place through @_) and the name is returned; otherwise the
# empty string is returned.
sub get_director($) {
    my $director = "";
    if ($_[0] =~ s/Leikstjóri:\s*([^.]*)\.//) {
        # The pattern has a single capture group; the previous code read
        # $2 and therefore never returned a name.
        $director = $1;
    }
    return $director;
}

# Attempts to parse actors from a string. If successful, the actor sentence
# is removed from the argument (modified in place through @_) and the names
# are returned as an array reference; returns undef when no actor sentence
# is found.
sub get_actors($) {
    my $actors;
    # "aðalhl?utverkum" accepts the correct spelling as well as the
    # misspelling ("aðalhutverkum") the pattern used to require.
    if ($_[0] =~ s/\s*(Aðalhlutverk:|[mM]eðal leikenda eru|Aðalhlutverk leika|[Íí] aðalhl?utverkum eru|[Ll]eikendur eru)\s*([^.]*)\.//) {
        my @names = split(/, | og /, $2);
        s/[.]$// foreach @names;
        push @$actors, @names;
    }
    return $actors;
}

# Cuts the leading "plus." off a channel id if present.
sub basechid($) {
    my ( $id ) = @_;
    # The dot is escaped so that only a literal "plus." prefix is cut.
    if ($id =~ /^plus\./) {
        return substr($id, 5);
    }
    return $id;
}

# True if the channel id starts with "plus.".
sub ispluschannel($) {
    return substr($_[0], 0, 5) eq "plus.";
}

# Takes XML and an XSL transform that converts it to a list of programme
# elements, and writes the elements to the XMLTV writer (after massaging
# them a bit).
#
# Arguments: writer, first date wanted, number of days wanted, hour offset
# added to every start/stop time, stylesheet text, source XML text.
# Only programmes starting in [first date, first date + number of days)
# are written. Dies if the stylesheet or the XML cannot be parsed.
sub process_xml_channel( $$$$$$ ) {
    my ($writer, $infromdate, $num_days, $timeoffset, $xsl_transform, $xml)
      = @_;

    my $fromdate = ParseDate($infromdate);
    my $enddate  = ParseDate(UnixDate(DateCalc($fromdate,
                                               "+ " . ($num_days) . " days"),
                                      '%Y-%m-%d'));

    # Step one: Use XSLT to munge XML to XMLTV format...
    my $parser = XML::LibXML->new();
    die unless defined $parser;
    my $xslt = XML::LibXSLT->new();
    die unless defined $xslt;
    my $stylesheet
      = $xslt->parse_stylesheet($parser->parse_string($xsl_transform));

    # suppress warning message caused by using Perl var rather than a
    # physical document()
    my $results;
    {
        local $SIG{__WARN__} = sub {
            warn @_
              unless (defined $_[0]
                      && $_[0] =~ /^I\/O warning : failed to load external entity "unknown-/);
        };
        $results = $stylesheet->transform($parser->parse_string($xml));
    }

    # Step 2: Loop through all programmes and all their children.
    # Do some processing on the children and then dump the programme
    # into the writer.

    # Build a date-sorted list so we can derive each programme stop from the
    # following programme start.
    my @programmes;
    foreach my $programme
      ($results->getDocumentElement->getElementsByTagName('programme')) {
        my $start = ParseDate($programme->getAttribute('start'));
        # ParseDate signals failure with an empty string, so a plain
        # defined() test would let unparsable dates through.
        next unless defined $start && length $start;
        push @programmes, { node => $programme, start => $start };
    }
    @programmes = sort { Date_Cmp($a->{start}, $b->{start}) } @programmes;

    foreach my $idx (0 .. $#programmes) {
        my $programme = $programmes[$idx]{node};
        my $myDate    = $programmes[$idx]{start};

        # Keep only programmes starting inside the requested window.
        next if Date_Cmp($myDate, $fromdate) < 0;
        next if Date_Cmp($myDate, $enddate) >= 0;

        # Add one hour if this is a plus channel
        my $startDate = DateCalc($myDate, "+ " . $timeoffset . " hours");
        $programme->setAttribute('start',
                                 UnixDate($startDate, '%q') . " +0000");

        # The stop time is the start of the next programme that begins
        # strictly later than this one.
        my $nextDate;
        foreach my $next_idx ($idx + 1 .. $#programmes) {
            if (Date_Cmp($programmes[$next_idx]{start}, $myDate) > 0) {
                $nextDate = $programmes[$next_idx]{start};
                last;
            }
        }

        my $stopDate;
        if (defined $nextDate) {
            $stopDate = DateCalc($nextDate, "+ " . $timeoffset . " hours");
        }
        else {
            # Ensure we always emit a stop time, even for the last item.
            $stopDate = DateCalc($startDate, "+ 1 hour");
        }
        $programme->setAttribute('stop',
                                 UnixDate($stopDate, '%q') . " +0000");

        foreach my $node ($programme->childNodes) {
            my $node_name = $node->nodeName;

            if ($node_name eq 'title' && $node->hasChildNodes()) {
                my $titleStr = $node->firstChild->data;
                $titleStr =~ s/ - N.TT$//; # Strip garbage that no one cares about
                $titleStr =~ s/ - Loka..ttur$//;
                # "(E:N)" in the title becomes an <episode-num> element.
                # (Plain "if" instead of the undefined "my ... if" form.)
                if ($titleStr =~ s/\s*\((\d+):(\d+)\)//) {
                    my $episodeNode
                      = XML::LibXML::Element->new('episode-num');
                    $episodeNode->appendTextNode("$1:$2");
                    $programme->appendChild($episodeNode);
                }
                $node->firstChild->setData($titleStr);
            }
            elsif ($node_name eq 'sub-title'
                   && (!$node->hasChildNodes()
                       || ($node->firstChild->data =~ /^[\n ]*$/))) {
                # Drop empty sub-titles.
                $programme->removeChild($node);
            }
            elsif ($node_name eq 'desc') {
                if ($node->hasChildNodes()
                    && ($node->firstChild->data !~ /^[\n ]*$/)) {
                    # Reuse an existing <credits> element, else create one
                    # right after the description. (findnodes in list
                    # context replaces get_node(0) on a NodeList whose
                    # positions count from 1.)
                    my ($credits) = $programme->findnodes('credits');
                    if (!defined($credits)) {
                        $credits = XML::LibXML::Element->new('credits');
                        $programme->insertAfter($credits, $node);
                    }

                    # If this is the description, extract director and
                    # actors. Each helper gets its own copy of the text,
                    # so the description itself is left untouched.
                    my $director = get_director($node->firstChild->data);
                    if (defined $director && $director ne "") {
                        my $dirNode = XML::LibXML::Element->new('director');
                        $dirNode->appendTextNode($director);
                        $credits->appendChild($dirNode);
                    }
                    my $actors = get_actors($node->firstChild->data);
                    foreach my $actor (@{ $actors || [] }) {
                        my $actNode = XML::LibXML::Element->new('actor');
                        $actNode->appendTextNode($actor);
                        $credits->appendChild($actNode);
                    }
                }
                else {
                    # Drop empty descriptions.
                    $programme->removeChild($node);
                }
            }
            elsif ($node_name eq 'category') {
                # FIXME where does the empty category element come from?
                # [bilbo] sometimes (for no determinable reason) the
                # document lookup is failing - maybe because it's a Perl
                # var and not a real doc, for some reason...? It fails to
                # load the document() (clue = no "I/O warning" in this
                # case). Run it again and it's fine. Or not!
                #
                # An empty <category/> has no text to map; calling ->data
                # on its missing child would be fatal, so drop the element.
                my $value_node = $node->firstChild;
                if (!defined $value_node) {
                    $programme->removeChild($node);
                    next;
                }

                ## Now done as a Supplement as of v1.30
                ## my %categories = ( '1'=>'children', '2'=>'series', '3'=>'news', '4'=>'educational', '5'=>'sports', '6'=>'misc', '7'=>'movie', '8'=>'culture', '9'=>'music', '11'=>'entertainment', '13'=>'news magazine' );

                # output the category name from the Supplement, else the
                # category numeric value
                my $raw_value = $value_node->data;
                my $category  = $categories{$raw_value} || $raw_value;
                $node->removeChildNodes();
                $node->appendText($category);
            }
        }

        # The writer was created with UNSAFE so that raw() is available.
        my $programme_xml = encode('UTF-8', $programme->toString . "\n");
        $writer->raw($programme_xml);
    }
}

# Fetch the RUV schedule XML around the requested window and hand it to
# process_xml_channel together with the stylesheet built here.
#
# Arguments: writer, first date wanted, number of days, channel id,
# hour offset for start/stop times, XMLTV channel id.
# Dies if the schedule cannot be fetched.
sub process_ruv_is ( $$$$$$ ){
    my ($writer, $fromdate, $num_days, $tv2chan, $timeoffset, $ch_xmltv_id)
      = @_;

    # NOTE(review): the three here-documents below are reproduced as found
    # in this copy. Their content does not look like a complete XSLT
    # stylesheet - markup appears to have been stripped before this copy
    # was made - and must be restored from the upstream file. $tv2chan and
    # $ch_xmltv_id are kept because the original stylesheet text presumably
    # interpolated them; confirm against upstream.
    my $xsl_transform = <<"EOF";
children series news educational sports misc movie culture music entertainment news magazine <xsl:apply-templates select="title"/>
EOF

    # [bilbo] following code removed in v1.30, ...
    #
    #
    #
    #
    #
    #
    #
    #
    #
    # <!-- category is no good: -->
    #
    # ..and replaced with:
    $xsl_transform .= <<"EOF";
EOF

    $xsl_transform .= <<"EOF";
:
EOF

    # Get the XML. Note that due to some edge cases, we must get data for
    # the day before and after the ones we're looking for and let
    # process_xml_channel remove most of the data. If we don't,
    # tv_validate_grabber will fail
    my $dayBefore = UnixDate(DateCalc($fromdate, "- 1 days"), '%Y-%m-%d');
    my $dayAfter  = UnixDate(DateCalc($fromdate,
                                      "+ " . ($num_days + 1) . " days"),
                             '%Y-%m-%d');
    my $url = "http://muninn.ruv.is/files/xml/sjonvarpid/$dayBefore/$dayAfter/";
    my $xml = get_nice( $url );
    die( "Failed to fetch $url" ) unless defined( $xml );
    process_xml_channel($writer, $fromdate, $num_days, $timeoffset,
                        $xsl_transform, $xml);
}

# get channel listing: returns a list of (channel id => channel name) pairs.
sub get_channels {
    return (
        "RUV" => "Rikissjonvarpid",
    );
}

# Icon URL for a given channel ("plus." prefix ignored); empty string when
# no icon is known.
sub get_icon {
    my ($ch_did) = @_;
    $ch_did = basechid($ch_did);
    if ($ch_did eq "RUV") {
        return "http://haffi.is/ico/ruv.jpg";
    }
    return "";
}

# Fetch the map of category numbers from the Supplement server and store it
# in the file-level %categories hash. Lines have the form "NUM==NAME",
# optionally followed by a "#" comment; blank lines and comment lines are
# skipped.
sub get_categories_map () {
    my $supplement = GetSupplement('tv_grab_is', 'category_map');
    return if not defined $supplement;

    foreach my $line (split /[\r\n]+/, $supplement) {
        trim($line);
        next if $line eq '' || $line =~ /^#/;

        my ($cat_num, $cat_name, $trash)
          = $line =~ /^(.*)==(.*?)([\s\t]*#.*)?$/;
        # A line without "==" yields no captures; skip it instead of
        # storing an entry under an undefined key.
        next if not defined $cat_num;
        $categories{$cat_num} = $cat_name;
    }
    return;
}

# Remove leading & trailing whitespace from the argument, in place.
sub trim {
    $_[0] =~ s/^\s+|\s+$//g;
}

# Normalize potentially mixed-encoded text to Perl characters so the writer
# can output valid UTF-8 consistently.
#
# Returns '' for undef, the input unchanged if it is already flagged as
# character data, otherwise the input decoded as UTF-8, falling back to
# ISO-8859-1 when it is not valid UTF-8.
sub normalize_to_text($) {
    my ($text) = @_;
    return '' if not defined $text;
    return $text if utf8::is_utf8($text);

    my $decoded;
    # LEAVE_SRC keeps $text intact, so the fallback below always sees the
    # complete input.
    my $ok = eval {
        $decoded = decode('UTF-8', $text, FB_CROAK | Encode::LEAVE_SRC);
        1;
    };
    if (!$ok) {
        $decoded = decode('ISO-8859-1', $text);
    }
    return $decoded;
}