#!/usr/bin/perl -w # vi:noet:ts=4 #------------------------------------------------------------------------------- # documentation #------------------------------------------------------------------------------- =pod =head1 NAME tv_grab_huro - Grab TV listings for Hungary or Romania. =head1 SYNOPSIS tv_grab_huro --help tv_grab_huro [--config-file FILE] --configure [--gui GUITYPE] tv_grab_huro [--config-file FILE] [--output FILE] [--days N] [--offset N] [--slow] [--get-full-description] [--max-desc-length LENGTH] [--icons | (--local-icons DIRECTORY [--no-fetch-icons])] [--gui GUITYPE] [--quiet] tv_grab_huro --list-channels --loc [hu | ro] [--icons | (--local-icons DIRECTORY [--no-fetch-icons])] tv_grab_huro --capabilities tv_grab_huro --version =head1 DESCRIPTION Output TV listings for several channels available in Hungary or Romania. The grabber relies on parsing HTML so it might stop working at any time. First run B to choose, which channels you want to download. Then running B with no arguments will output listings in XML format to standard output. B<--configure> Prompt for which channels, and write the configuration file. B<--config-file FILE> Set the name of the configuration file, the default is B<~/.xmltv/tv_grab_huro.conf>. This is the file written by B<--configure> and read when grabbing. B<--gui GUITYPE> Use this option to enable a graphical interface to be used. OPTION may be 'Tk', or left blank for the best available choice. Additional allowed values of OPTION are 'Term' for normal terminal output (default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar. B<--output FILE> Write to FILE rather than standard output. B<--days N> Grab N days. The default is eight. B<--offset N> Start N days in the future. The default is to start from today. B<--quiet> Suppress the progress messages normally written to standard error. B<--slow> Enables long strategy run: port.hu publishes only some (vital) information on the actual listing pages, the rest is shown in a separate popup window. If you'd like to parse the data from these popups as well, supply this flag. But consider that the grab process takes much longer when doing so, since many more web pages have to be retrieved. B<--get-full-description> This is quite like B<--slow> but doesn't always download data from popup window. Instead this is only requested if description in overview is truncated. B<--list-channels> Write output giving elements for every channel available (ignoring the config file), but no programmes. B<--capabilities> Show which capabilities the grabber supports. For more information, see L B<--version> Show the version of the grabber. B<--icons> and B<--local-icons DIRECTORY> get the URL for channel-logos together with the channel-list. With B<--icons> specified the logos(images) will be not fetched just their URL (http://...) will be written in the output XML. If called with B<--local-icons>, the generated URL (file://...) will point to the the local directory DIRECTORY and all channel logos will be grabbed and saved under this place. Use B<--no-fetch-icons> option to disable the icon fetching. Note: icon fetching currently works for Hungary and Romania only. B<--max-desc-length LENGTH> can be used to maximize the length of the grabbed program long description. This can be useful if you have a viewer program (using this xmltv output), which can not be display userfriendly the description if it is more then LENGTH character. B<--help> Print a help message and exit. =head1 SEE ALSO L. =head1 AUTHOR Attila Szekeres and Zsolt Varga. Based on tv_grab_fi by Matti Airas. Heavily patched and earlier maintained by Stefan siegl , reworked and now maintained by Balazs Molnar . =head1 BUGS The data source does not include full channels information and the channels are identified by short names rather than the RFC2838 form recommended by the XMLTV DTD. =cut #------------------------------------------------------------------------------- # initializations #------------------------------------------------------------------------------- use utf8; use strict; use XMLTV; use XMLTV::Version "$XMLTV::VERSION"; use XMLTV::Capabilities qw/baseline manualconfig cache/; use XMLTV::Description 'Hungary/Romania'; use XMLTV::Supplement qw/GetSupplement/; use Getopt::Long; use Date::Manip; use Cwd; use HTML::TreeBuilder; use HTML::Entities; # parse entities use IO::File; use File::Basename; use JSON; use Encode; use Time::Piece (); use Time::Seconds; use XMLTV::Memoize; use XMLTV::ProgressBar; use XMLTV::Ask; use XMLTV::DST; use XMLTV::Get_nice; use XMLTV::Mode; use XMLTV::Config_file; use XMLTV::Date; use XMLTV::Gunzip; # Todo: perhaps we should internationalize messages and docs? use XMLTV::Usage <<"END" $0: get Hungarian or Romanian television listings in XMLTV format To configure: $0 --configure [--config-file FILE] [--gui GUITYPE] To grab listings: $0 [--config-file FILE] [--output FILE] [--days N] [--offset N] [--slow] [--get-full-description] [--max-desc-length LENGTH] [--icons | (--local-icons DIRECTORY [--no-fetch-icons])] [--gui GUITYPE] [--quiet] To list channels: $0 --list-channels --loc [hu | ro] [--icons | (--local-icons DIRECTORY [--no-fetch-icons])] To show capabilities: $0 --capabilities To show version: $0 --version END ; # ${Log::TraceMessages::On} = 1; # to switch TRACE in remove the comment from prev. line # Use Log::TraceMessages if installed. BEGIN { eval { require Log::TraceMessages }; if ($@) { *t = sub {}; *d = sub { '' }; } else { *t = \&Log::TraceMessages::t; *d = \&Log::TraceMessages::d; Log::TraceMessages::check_argv(); } } my ($opt_days, $opt_offset, $opt_help, $opt_output, $opt_configure, $opt_config_file, $opt_gui, $opt_quiet, $opt_list_channels, $opt_loc, $opt_slow, $opt_full_desc, $opt_local_icons, $opt_icons, $opt_no_fetch_icons, $opt_max_desc_length, $opt_worker_times, $opt_now); our $FETCHOFFSET = 0; our ($DAYSPERPAGE, $TZ, $COUNTRY, $CONFIG_FILE, $WNAME, $WSTIME); our (%CATMAP, %JOBMAP, %CHANNELS, %WTIMES); #our %COUNTRIES = (Hungary => [ 'hu', '+0100' ], # Czech => [ 'cz', '+0100' ], # Romania => [ 'ro', '+0200' ], # Slovakia => [ 'sk', '+0100' ]); our %COUNTRIES = (Hungary => [ 'hu', '+0100' ]); our %WORDS = ( cz => { episode => "Epizoda", minute => "minut", links => "Linky" }, hu => { episode => "rész", minute => "perc", links => "linkek" }, ro => { episode => "episodul", # patch #84 minute => "minute", links => "Linkuri" }, sk => { episode => "Epizóda", minute => "minút", links => "Linky" } ) ; our $DEFAULT_ENCODING = 'ISO-8859-2'; our $rating_baseurl = 'http://media.port-network.com/page_elements/'; our %AGE_LIMITS = ( # Todo: insert cz & sk translations 'ageLimitList-1' => [{'hu' => 'korhatárra tekintet nélkül megtekinthető', 'ro' => 'Audienţă generală'}, 'nmhh_akk/mobil_35x35/0_age_icon_mobil.png'], 'ageLimitList-5' => [{'hu' => '16 éven aluliak számára nem ajánlott', 'ro' => 'Acest program este interzis minorilor sub 16 ani'}, 'nmhh_akk/mobil_35x35/16_age_icon_mobil.png'], 'ageLimitList-3' => [{'hu' => '12 éven aluliak számára a megtekintése nagykorú felügyelete mellett ajánlott', 'ro' => 'Acest program este interzis minorilor sub 12 ani'}, 'nmhh_akk/mobil_35x35/12_age_icon_mobil.png'], 'ageLimitList-4' => [{'hu' => '14 éven alul nem ajánlott', 'ro' => 'Acest program este interzis minorilor sub 14 ani'}, 'm_14_age_mini_pix.png'], 'ageLimitList-6' => [{'hu' => '18 éven aluliak számára nem ajánlott', 'ro' => 'Acest program este interzis minorilor sub 18 ani'}, 'nmhh_akk/mobil_35x35/18_age_icon_mobil.png'], 'ageLimitList-8' => [{'hu' => '7 éven aluliak számára nem ajánlott', 'ro' => 'Acest program este interzis minorilor sub 7 ani'}, 'm_7_age_mini_pix.png'], 'ageLimitList-10' => [{'hu' => '6 éven aluliak számára nem ajánlott', 'ro' => 'Acest program este interzis minorilor sub 6 ani'}, 'nmhh_akk/mobil_35x35/6_age_icon_mobil.png'], 'ageLimitList-2' => [{'hu' => 'szülői engedéllyel', 'ro' => 'Recomandat acordul părinţilor'}, 'm_parental_guidance_mini_pix_hu.png'], 'ageLimitList-7' => [{'hu' => '15 éven aluliak számára nem ajánlott', 'ro' => 'Acest program este interzis minorilor sub 15 ani'}, 'm_15_age_mini_pix.png']); our %PROGRAM_CATEGORIES = ( # Todo: insert cz & sk translations 'tvEventType-0' => {'hu' => 'egyéb', 'ro' => 'nedefinit'}, 'tvEventType-11' => {'hu' => 'vallási műsor', 'ro' => 'emisiune religioasă'}, 'tvEventType-4' => {'hu' => 'gyermek műsor', 'ro' => 'copii'}, 'tvEventType-10' => {'hu' => 'dokumentumfilm', 'ro' => 'documentar'}, 'tvEventType-12' => {'hu' => 'filmsorozat', 'ro' => 'serial'}, 'tvEventType-13' => {'hu' => 'szabadidős műsor', 'ro' => 'family'}, 'tvEventType-14' => {'hu' => 'zenei műsor', 'ro' => 'muzica'}, 'tvEventType-15' => {'hu' => 'hírműsor', 'ro' => 'ştiri'}, 'tvEventType-1' => {'hu' => 'sportműsor', 'ro' => 'sport'}, 'tvEventType-3' => {'hu' => 'hír-, politikai műsor', 'ro' => 'tv show'}, 'tvEventType-7' => {'hu' => 'művészeti műsor', 'ro' => 'tv show'}, 'tvEventType-8' => {'hu' => 'ismeretterjesztő műsor', 'ro' => 'stiinta'}, 'tvEventType-9' => {'hu' => 'szappanopera', 'ro' => 'telenovelă'}, 'tvEventType-18' => {'hu' => 'gasztronómiai műsor', 'ro' => 'gastro'}, 'tvEventType-20' => {'hu' => 'életstílus', 'ro' => 'life style'}, 'tvEventType-2' => {'hu' => 'film', 'ro' => 'film'}, 'tvEventType-5' => {'hu' => 'szórakoztató műsor', 'ro' => 'reality show'}, 'tvEventType-6' => {'hu' => 'szolgáltató műsor', 'ro' => 'tv show'}, 'tvEventType-16' => {'hu' => 'divat', 'ro' => 'modă'}, 'tvEventType-17' => {'hu' => 'felnőtt', 'ro' => 'pentru adulţi'}, 'tvEventType-19' => {'hu' => 'reality', 'ro' => 'reality-show'}); sub domain(); sub xid( $ ); sub xhead(); sub process_table( $$$$ ); sub process_json( $$$$ ); sub parse_short_desc ( $ ); sub get_channels( ;$ ); sub get_channels_json( ;$ ); sub get_infourl_data( $$ ); sub get_infourl_data_json( $$ ); sub add_person ( $$$ ); sub extract_episode( $ ); sub grab_icon( $ ); sub get_channel_urls( $ ); sub worker( $ ); sub showworkers(); sub get_all_text ( $ ); # Get options, including undocumented --cache option. XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); #------------------------------------------------------------------------------- # domain #------------------------------------------------------------------------------- # desc : construct the www host's hostname's domain part # arguments : none # returns : "port.hu", "port.ro" based on country id #------------------------------------------------------------------------------- sub domain() { "port.$COUNTRY" }; #------------------------------------------------------------------------------- # xid #------------------------------------------------------------------------------- # desc : turn a site channel id into an XMLTV id # arguments : 1- port site channel id: 005 (f.e) # returns : port.hu, port.ro based on country id #------------------------------------------------------------------------------- sub xid( $ ) { my $id = shift; return "$id." . domain(); } #------------------------------------------------------------------------------- # xhead #------------------------------------------------------------------------------- # desc : provide the head of the XML output # arguments : none # returns : hash, containing the info's XML tags #------------------------------------------------------------------------------- sub xhead() { my $d = &domain; return { 'source-info-url' => "https://www.$d/", 'source-data-url' => "https://www.$d/tv/", 'generator-info-name' => 'XMLTV', 'generator-info-url' => 'http://xmltv.org/', }; } # function to parse all of text data of a HTML element and his childs: #------------------------------------------------------------------------------- # get_all_text #------------------------------------------------------------------------------- # desc : parse all of text data of a HTML element and in his childs # arguments : 1- HTML:Element object from here will be started the downhill # returns : @arrays of founded text elements #------------------------------------------------------------------------------- sub get_all_text( $ ) { my @tmplines; my $e = $_[0]; if (ref $e) { foreach my $c ($e->content_list) { push @tmplines, get_all_text($c); } } else { push @tmplines, $e; } return @tmplines; } #------------------------------------------------------------------------------ # process_table #------------------------------------------------------------------------------ # desc : fetch a URL and process it # arguments : 1- Date::Manip object, basedate/startdate of grabbing (e.g. 20060205) # 2- xmltv id of channel # 3- site id of channel # 4- dayindex of the requested page on port.hu|ro # returns : list of the programme hashes to write #------------------------------------------------------------------------------ sub process_table( $$$$ ) { my ($basedate, $ch_xmltv_id, $ch_port_id, $baseday) = @_; $basedate = UnixDate(DateCalc(parse_date($basedate),"- 1 day"), '%Q'); my $days_to_request = $DAYSPERPAGE+1; # We have to request at minimum of four days $days_to_request = 4 if ($days_to_request<4); my $basedateday = UnixDate(parse_date($basedate), '%e'); my $urlfmt = "http://www." . domain() . "/pls/tv/".($COUNTRY eq 'hu' || $COUNTRY eq 'ro' ? 'old' : '')."tv.channel?i_ch=$ch_port_id" . "&i_date=%d&i_xday=".$days_to_request."&i_where=1"; # bug #501 my $url = "$urlfmt"; my $currday = $baseday + $FETCHOFFSET - 1; my ($tree, $body); local $SIG{__WARN__} = sub { warn "$url: $_[0]"; }; # make (maximum) two loop to fetch program data: # # if the grabber runs in eraly hours (e.g. 01:00, 02:00) port.hu returns # the yesterdays's program as today's program... so we have to check it, # example.hu: Péntek (február 27.) # example.ro: Duminic\u0103 (26 februarie) # if this failes, we construct the previous (or next?) day's url foreach (1, 2) { $url = sprintf($urlfmt, $basedate); t "fetching url: $url"; worker("base-downloading"); $XMLTV::Get_nice::FailOnError = 1; my $data=get_nice($url); # strip links to bet-at-home.com # bug #447 $data =~ s|\x0A||g; # strip new line $data =~ s|||g; $data =~ s|||g; # strip links to divido.hu and provideo.ro $data =~ s|||g; $tree = HTML::TreeBuilder->new_from_content($data) or die "could not fetch/parse $url (progamtable)\n"; worker("base-parsing"); $body = $tree->look_down("_tag"=>"body"); my @daysonpage = () ; foreach ($body->look_down("_tag"=>"p", "class" => "date_box")) { $_ = $_->as_text(); if ((($COUNTRY eq 'hu') && (m/^\s*\S+\s+\(\s*\S+\s+(\d+)\.\s*\)\s*$/)) || (($COUNTRY eq 'ro') && (m/^\s*\S+\s+\(\s*(\d+)\s+\S+\s*\)\s*$/)) || (($COUNTRY eq 'sk' || $COUNTRY eq 'cz') && (m/^\s*\S+\s+\(\s*(\d+)\.\s+\S+\s*\)\s*$/))) { t "added founded date of the month on the grabbed page: $1"; push @daysonpage, $1; } } if (@daysonpage) { # check date ... is the first founded date on the page the requested? last if ($basedateday == $daysonpage[0]); $body = undef; $tree->delete(); t "requested from $basedate, but port.$COUNTRY returned programs from wrong day: $daysonpage[0]"; if (UnixDate(DateCalc($basedate, "- 1 days"), '%e') == $daysonpage[0]) { # port.hu returned the programms from yesterday $FETCHOFFSET += 1 ; } elsif (UnixDate(DateCalc($basedate, "+ 1 days"), '%e') == $daysonpage[0]) { # port.hu returned the programms from tommorrow $FETCHOFFSET -= -1 ; } else { t "fetched HTML page do not contain 0, +1 or -1 day of the reuested one"; last; } t "global fetch offset was set to: $FETCHOFFSET"; } else { warn "no date data found on the fetched HTML page, trying to continue"; last; } } if (! defined($body)) { warn "Could not found the requested day's data on the grabbed HTML page, " . "some programs on $ch_xmltv_id channel will be not fetched."; return; } # the page consists of two major tables, where one holds the data # until 'bout 20 o'clock, the other, i.e. lower, one the evening program # the programs are in statements in more tables, this tables are # intermited with other tables, which contain images # - we need only the rows, which hold program data; because tables are # structured into other tables, we need only the most inner row # -> if it contains program data, and # -> has no child-tables in it # - there are tables, which will be the lower region delimiter # this is most inner and contains the vonal.gif, we will add this row # only as "lower region" string my @rows; my $empty_first_column; foreach ($body->look_down("_tag"=>"tr")) { if ($_->look_down("_tag"=>"img", "src" => "/tv/kep/vonal.gif")) { # ther is the "vonal.gif" in the table? t "+ most inner row containing vonal.gif found"; push @rows, "lower region"; } # look for line.gif as separator as well, per zolih@hotmail.com elsif ($_->look_down("_tag"=>"img", "src" => qr/\/line.gif$/)) { # ther is the "vonal.gif" in the table? t "+ most inner row containing vonal.gif found"; # in some cases the lower part of the first column is empty (See ticket #2890433) # We're trying to identify this case if (! scalar $_->look_down("_tag"=>"table")->look_down("_tag"=>"td", class=>"time_container")) { t "first column is empty"; $empty_first_column = 1; } push @rows, "lower region"; } # have childrens? if yes, skip this row next if ($_->look_down("_tag"=>"table")); # actual show is duplicated sometimes, and one of them is hidden next if ($_->parent()->attr("class") && $_->parent()->attr("class") eq "actual_showtime_hide"); if ($_->as_text() =~ /[012]?[0-9]:[0-5][0-9]/ || ($_->attr("class") && $_->attr("class") =~ /^event_/)) { t "+ most inner row containing programs found:"; t $_->as_text(); push @rows, $_; } } # walk through the rows to create programs # if you grab channel programs for 3 days in one fetch, you will have # the order of the rows is, how thay appear in the output, so the # 1-day, 2-day, 3-day, 1-night, 2-night-part1, 2-night-part2, 3-night my @programs; my $region = "upper"; my $startdate = $basedate; my $lasttime = 0; my $lasttimeHHMM = ""; # we need all the rows, because this is a program record: foreach my $row (@rows) { # check whether if we are first in lower tables ... -> reset date if (! ref($row) && $row eq "lower region") { if ($region eq "upper" && @programs) { t "upper/lower delimiter found, setting date back to startdate ($basedate)"; $startdate = $basedate; $lasttime = 12 * 60; # bottom row start after 18 o'clock, this gives us some safety, if the current show indicator is around $currday = $baseday + $FETCHOFFSET - 1; $region = "lower"; # in some cases the lower part of the first column is empty (See ticket #2890433) # In this case we have to skip the first day if ($empty_first_column) { t "first column is empty, bumped to next day"; $currday++; $startdate = UnixDate(DateCalc(parse_date($startdate), "+ 1 day"), '%Q'); } } next; } my (@urls, %program); foreach my $col ($row->look_down("_tag"=>"td")) { # the column can hold following type of data: # begin time | title | long desc | url | category my $begin_time; # this matches the currently running programme only: if ($col->attr("colspan") && ($begin_time = $col->look_down("_tag"=>"p", "class"=>"begin_time" ))) { $_ = $begin_time->as_text(); } elsif ($begin_time = $col->look_down("_tag"=>"td", "class"=>"time_container")) { $_ = $begin_time->look_down("_tag"=>"div")->as_text(); } else { $_ = $col->as_text(); $_ =~ tr/\xA0/ /; $_ =~ s/^\s+//; $_ =~ s/\s+$//; } s/^\s+//;s/\s+$//; # port.hu makes sometimes empty td elements next if (! length); t "col contents as text:" . d $_; if (m/^([012]?[0-9]):([0-5][0-9])$/) { s/^Kb[.]//; # means 'approx' in Magyar s/^24:/00:/; my $time = $1 * 60 + $2; if ($time < $lasttime) { t "bumped to the the next day"; $currday++; $startdate = UnixDate(DateCalc(parse_date($startdate), "+ 1 day"), '%Q'); die if not defined $startdate; } $lasttime = $time; # Fix the time format to be suitable for sorting $program{time} = length($_) == 4 ? "0".$_ : $_ ; $program{startdate} = $startdate ; $lasttimeHHMM = $program{time}; } else { my @span; if (@span = $col->look_down("class"=>"btxt")) { $program{title} = $span[0]->as_text(); } elsif (@span = $col->look_down("class"=>"lbbtxt")) { $program{title} = $span[0]->as_text(); } else { warn "cannot found title: $startdate" ; } # add one space after the title, if there is none # ??? $program{title} = ' ' if $program{title} eq ''; # bug #408 my @tmp = get_all_text($col); $_ = join(' ', @tmp); s/ +/ /g; s/[^\w]*putbox\(\"[0-9][0-9]\"\)[\s\n\r]*//g; s/Megvásárolható (DVD[ ]?-n|VHS[ ]?-en)//g; # strip leading   (and other spaces) s/^[ \t\xA0]*//g; # turn left over   into normal ones # (also take care of tabs and multiple spaces while here) s/[ \t\xA0]+/ /g; $program{desc} = $_ if length($_); $program{day} = $currday; foreach my $a ($col->look_down("_tag"=>"a")) { push @urls, $a->attr(q(href)); } # support ticket #202 # get the rating if available # (15) if (my $img = $span[0]->parent()->look_down('_tag' => 'img', 'class' => 'age_limit_icon')) { t "age limit icon found"; my $rating = $img->attr('alt'); $rating =~ s/[\(\)]//g; # strip the brackets my $rating_icon = $img->attr('src'); t "age rating = $rating"; $program{rating} = [[ $rating, '', [{'src' => $rating_icon }] ]]; } } } # foreach $col # New type of row: contains only the time of the event # but not the actual program data. So in this case we try to # jump to the next row if (! $row->attr("class") || (not $row->attr("class") =~ /[\s]*event_/ )) { t "found line without event, the time is: $lasttimeHHMM"; next; } if (! $program{time} && $lasttimeHHMM) { # Fix the time format to be suitable for sorting $program{time} = $lasttimeHHMM; $program{startdate} = $startdate ; } # add all parsed info, as program{time, title, desc, category, date} $program{infourl} = \@urls if (@urls); parse_short_desc(\%program); t "pushing ".$program{title}; push @programs, \%program if (defined $program{time}) && (defined $program{title}); } # foreach $row $tree->delete; # get rid of HTML::TreeBuilder's in memory representation if (not @programs) { warn "no programs found, skipping\n"; return (); } # make a sort on programs, short compare function: cmp startdate, time # stringwise (this gives the same rsult as comapre datewise) # Date_Cmp(UnixDate($left->{time},'%H:%M'),UnixDate($right->{time},'%H:%M'); sub bytime { ($a->{startdate}.$a->{time}) cmp ($b->{startdate}.$b->{time}); } @programs = sort bytime @programs; t "programs in sorted order:"; t "program:" . d $_ foreach (@programs); my (@r, $prev); # assume lang == country my $lang = $COUNTRY; foreach my $program (@programs) { my $prog; $prog->{channel}=$ch_xmltv_id; $prog->{title}=[ [ $program->{title}, $lang ] ]; my $start=parse_local_date("$program->{startdate} $program->{time}", $TZ); my ($start_base, $start_tz) = @{date_to_local($start, $TZ)}; $prog->{start}=UnixDate($start_base, '%q') . " $start_tz"; $prog->{desc} = [[ $program->{desc}, $lang ]] if defined $program->{desc}; $prog->{category} = $program->{category} if (defined $program->{category}); $prog->{date} = $program->{date} if defined $program->{date}; $prog->{qw(episode-num)} = $program->{qw(episode-num)} if defined $program->{qw(episode-num)}; $prog->{length} = $program->{length} if defined $program->{length}; $prog->{rating} = $program->{rating} if defined $program->{rating}; # support #202 # Setting stop date for the previous programm # Last program in the grabbed list has no stop attribute, sorry. # Port.hu uses a virtual program as the last programme # anyway if ((defined($prev)) && $prev->{start} ne $prog->{start}) { $prev->{stop} = $prog->{start}; } # We skip those programs, that are out of the requested time frame if ($program->{day} > $opt_days+$opt_offset || $program->{day}<1+$opt_offset) { $prev = undef; next; } worker("slow-parsing"); foreach my $infourl (@{$program->{infourl}}) { # always read data from linked page (in --slow mode) # in --get-full-description mode read if description ends in '...' if ( ($opt_slow) || ($opt_full_desc && (defined $prog->{desc}) && ($prog->{desc})->[0]->[0] =~ m/\.\.\.$/) ) { get_infourl_data($prog, $infourl); } } worker("base-parsing"); push @r, $prog; if ((defined($prev)) && $prev->{start} eq $prog->{start}) { # starttime of previous and current programme is equal, # therefore use clumpidx to express relation my $clumps_num = 2; if (defined($r[-2]->{q(clumpidx)})) { # previous programme already has a clumpidx arg assigned. ($clumps_num) = $r[-2]->{q(clumpidx)} =~ m|^\d+/(\d+)$|; } # okay, assign new clumpidx values ... for (0 .. ($clumps_num-1)) { $r[-$clumps_num+$_]->{q(clumpidx)} = "$_/$clumps_num"; } } $prev = $prog; } return @r; } #------------------------------------------------------------------------------ # process_json #------------------------------------------------------------------------------ # desc : fetch a URL and process it # arguments : 1- Date::Manip object, basedate/startdate of grabbing (e.g. 20060205) # 2- xmltv id of channel # 3- site id of channel # 4- dayindex of the requested page on port.hu|ro # returns : list of the programme hashes to write #------------------------------------------------------------------------------ sub process_json( $$$$ ) { my ($basedate, $ch_xmltv_id, $ch_port_id, $baseday) = @_; # $basedate = UnixDate(DateCalc(parse_date($basedate),"- 1 day"), '%Q'); $basedate = UnixDate(parse_date($basedate), '%Q'); my $days_to_request = $DAYSPERPAGE; my $basedateday = UnixDate(parse_date($basedate), '%e'); my $to_date = Time::Piece->strptime( $basedate, '%Y%m%d'); $to_date += ONE_DAY * $days_to_request; $ch_port_id =~ s/^0+//; my $d = domain(); my $urlfmt = "https://" . $d . (($COUNTRY eq 'hu') ? "/tvapi?channel_id=tvchannel-" : "/pls/w/tv_api.event_list?i_channel_id=").$ch_port_id. "&i_datetime_from=%s&i_datetime_to=".$to_date->strftime('%Y-%m-%d'); my $url = "$urlfmt"; local $SIG{__WARN__} = sub { warn "$url: $_[0]"; }; my $json_data; my $lang = $COUNTRY; # make (maximum) two loop to fetch program data: # # if the grabber runs in early hours (e.g. 01:00, 02:00) port.hu returns # the yesterdays's program as today's program... so we have to check it, # example.hu: Péntek (február 27.) # example.ro: Duminic\u0103 (26 februarie) # if this fails, we construct the previous (or next?) day's url my @daysonpage = (); foreach (1, 2) { $url = sprintf($urlfmt, UnixDate($basedate, '%Y-%m-%d')); t "fetching url: $url"; worker("base-downloading"); $XMLTV::Get_nice::FailOnError = 1; my $data=get_nice($url); $data =~ s/<\/?span[^>]{0,}>\s?//g; # remove html elements $json_data = (($DEFAULT_ENCODING !~ /utf\-?8/i) && ($COUNTRY eq 'hu')) ? JSON->new->utf8(0)->decode(encode($DEFAULT_ENCODING,decode('utf-8', $data))) : JSON->new->utf8(0)->decode($data) or die "could not fetch/parse $url (json structure)\n"; worker("base-parsing"); foreach my $act_secs (sort(keys(%{$json_data}))) { my $wday = UnixDate(($COUNTRY eq 'hu') ? $json_data->{$act_secs}->{'date_from'} : $json_data->{$act_secs}->{'datetime_from'}, '%d'); t "added founded date of the month on the grabbed page: $wday"; push @daysonpage, $wday; } if (@daysonpage) { # check date ... is the first founded date on the page the requested? last if ($basedateday == $daysonpage[0]); t "requested from $basedate, but port.$COUNTRY returned programs from wrong day: $daysonpage[0]"; if (UnixDate(DateCalc($basedate, "- 1 days"), '%e') == $daysonpage[0]) { # port.hu returned the programms from yesterday $FETCHOFFSET += 1 ; } elsif (UnixDate(DateCalc($basedate, "+ 1 days"), '%e') == $daysonpage[0]) { # port.hu returned the programms from tommorrow $FETCHOFFSET -= -1 ; } else { t "fetched HTML page do not contain 0, +1 or -1 day of the reuested one"; last; } t "global fetch offset was set to: $FETCHOFFSET"; } else { warn "no date data found on the fetched HTML page, trying to continue"; last; } } if (! defined($json_data)) { warn "Could not found the requested day's data on the grabbed JSON structure, " . "some programs on $ch_xmltv_id channel will be not fetched."; return; } my @programs; # JSON structure # => # { # 'date_from' => 'YYYY-MM-DDTHH:MM:SS+HH:MM' # 'date_to' => 'YYYY-MM-DDTHH:MM:SS+HH:MM' # 'channels' => [{ # 'date' => 'YYYY-MM-DDTHH:MM:SS+HH:MM' (date of query) # 'date_from' => 'YYYY-MM-DDTHH:MM:SS+HH:MM' # 'date_until' => 'YYYY-MM-DDTHH:MM:SS+HH:MM' (equals with date_to) # 'domain' => 'port.hu' # 'id' => 'tvchannel-N' # 'name' => 'Channel Name' # ... # 'programs' => [{ # 'id' => 'event-t-NNNNNNNN' # 'start_ts' => # 'start_datetime' => 'YYYY-MM-DDTHH:MM:SS+HH:MM' # 'end_datetime' => 'YYYY-MM-DDTHH:MM:SS+HH:MM' # 'start_time' => 'HH:MM' # 'end_time' => 'HH:MM' # 'title' => 'Show title' # 'episode_title' => 'Ep title' or undef # 'short_description' => 'Short desc.' or undef # 'description' => 'Desc.' or undef # 'film_url' => '/adatlap/film/tv/...' # 'restriction' => { 'category' => 'tvEventType-N', 'age_limit' => 'ageLimitList-N' ] # 'attributes_text' => '(ism.)' / '(élő)' / '(DS)' etc. or '' # 'italics' => 'Feliratozva ...' or undef # ... # # }] # }] # } foreach my $act_secs (sort(keys(%{$json_data}))) { my $all_prog_data = $json_data->{$act_secs}->{'channels'}[0]->{'programs'}; foreach my $prog_data (@{$all_prog_data}) { my %program; $program{startdate} = UnixDate($prog_data->{'start_datetime'}, '%Q'); my $currday = $program{startdate}; $currday =~ s/^[0-9]{6}([0-9]{2})$/$1/; $program{day} = $currday - $daysonpage[0] + 1; # We skip those programs, that are out of the requested time frame if ($program{day} != 1) { next; } $program{time} = $prog_data->{'start_time'}; $program{time} =~ s/^([012]?[0-9]):([0-5][0-9])$/$1$2/; $program{enddate} = UnixDate($prog_data->{'end_datetime'}, '%Q') if (defined($prog_data->{'end_datetime'})); t "--- missing end_time (".$prog_data->{'title'}.") ".d $prog_data if (!defined($prog_data->{'end_time'})); $program{endtime} = $prog_data->{'end_time'}; $program{endtime} =~ s/^([012]?[0-9]):([0-5][0-9])$/$1$2/ if (defined($program{endtime})); $program{title} = (defined($prog_data->{'title'}) && $prog_data->{'title'} ne "") ? $prog_data->{'title'} : ' '; if ($prog_data->{'short_description'}) { $program{desc} = $prog_data->{'short_description'}; } elsif ($prog_data->{'description'}) { $program{desc} = $prog_data->{'description'}; } # if ($prog_data->{'attributes_text'}) { # $program{desc} = ($program{desc}) ? $program{desc}.', '.$prog_data->{'attributes_text'} : $prog_data->{'attributes_text'}; # } if ($prog_data->{'episode_title'}) { if (($program{desc}) && ($program{desc} =~ /$WORDS{$COUNTRY}->{episode}/)) { $program{desc} =~ s/($WORDS{$COUNTRY}->{episode})/$1, $prog_data->{'episode_title'}/; } else { $program{desc} = ($program{desc}) ? $program{desc}.', '.$prog_data->{'episode_title'} : $prog_data->{'episode_title'}; } } if ($program{desc}) { $program{desc} =~ s/ +/ /g; $program{desc} =~ s/Megvásárolható (DVD[ ]?-n|VHS[ ]?-en)//g; } if ($prog_data->{'film_url'}) { my @url = ((($COUNTRY eq 'hu') ? 'https://'.domain() : '').$prog_data->{'film_url'}); $program{infourl} = \@url; } # support ticket #202 # get the rating if available my $actcat = ""; if ($prog_data->{'restriction'}) { $prog_data->{'restriction'}->{'age_limit'} =~ s/(\d+)/ageLimitList-$1/ if (($COUNTRY eq 'ro') && ($prog_data->{'restriction'}->{'age_limit'} !~ /ageLimitList/)); $prog_data->{'restriction'}->{'category'} =~ s/(\d+)/tvEventType-$1/ if (($COUNTRY eq 'ro') && ($prog_data->{'restriction'}->{'category'} !~ /tvEventType/)); if (($prog_data->{'restriction'}->{'age_limit'}) # 'ageLimitList-N' && ($AGE_LIMITS{$prog_data->{'restriction'}->{'age_limit'}})) { my $rating = ($DEFAULT_ENCODING !~ /utf\-?8/i) ? encode($DEFAULT_ENCODING, $AGE_LIMITS{$prog_data->{'restriction'}->{'age_limit'}}[0]->{$lang}) : $AGE_LIMITS{$prog_data->{'restriction'}->{'age_limit'}}[0]->{$lang}; $program{rating} = [[ $rating, '', [{'src' => $rating_baseurl.$AGE_LIMITS{$prog_data->{'restriction'}->{'age_limit'}}[1] }] ]]; } if (($prog_data->{'restriction'}->{'category'}) # 'tvEventType-N' && ($PROGRAM_CATEGORIES{$prog_data->{'restriction'}->{'category'}})) { $actcat = ($DEFAULT_ENCODING !~ /utf\-?8/i) ? encode($DEFAULT_ENCODING, $PROGRAM_CATEGORIES{$prog_data->{'restriction'}->{'category'}}->{$lang}) : $PROGRAM_CATEGORIES{$prog_data->{'restriction'}->{'category'}}->{$lang}; $program{desc} = ($program{desc}) ? $program{desc}.', '.$actcat : $actcat; # insert actual category - see parse_short_desc } } # add all parsed info, as program{time, title, desc, category, date} parse_short_desc(\%program); if ((defined($program{desc})) && ($program{desc} =~ $actcat)) { # remove actual category $program{desc} =~ s/[,\ ]{0,2}$actcat$//; delete($program{desc}) if (!length($program{desc})); } worker("slow-parsing"); foreach my $infourl (@{$program{infourl}}) { # always read data from linked page (in --slow mode) # in --get-full-description mode read if description ends in '...' if ( ($opt_slow) || ( $opt_full_desc && ( ((defined $program{desc}) && ($program{desc} =~ m/\.\.\.$/) ) || ((!$program{desc}) && (!$program{category}) && ($actcat eq $PROGRAM_CATEGORIES{'tvEventType-2'}->{$lang})) ) ) ) { # program without desc & category + actcat = 'film'? get_infourl_data_json(\%program, $infourl); # parse_short_desc(\%program); } } t "pushing ".$program{title}; push @programs, \%program if (defined $program{time}) && (defined $program{title}); } # foreach $prog_data } # foreach $json_data if (not @programs) { warn "no programs found, skipping\n"; return (); } # make a sort on programs, short compare function: cmp startdate, time # stringwise (this gives the same rsult as comapre datewise) # Date_Cmp(UnixDate($left->{time},'%H:%M'),UnixDate($right->{time},'%H:%M'); # sub bytime { # ($a->{startdate}.$a->{time}) cmp ($b->{startdate}.$b->{time}); # } @programs = sort {$a->{startdate}.$a->{time} cmp $b->{startdate}.$b->{time}} @programs; t "programs in sorted order:"; t "program:" . d $_ foreach (@programs); my (@r, $prev); # assume lang == country # my $lang = $COUNTRY; foreach my $program (@programs) { my $prog; $prog->{channel}=$ch_xmltv_id; $prog->{title}=[ [ $program->{title}, $lang ] ]; my $start=parse_local_date("$program->{startdate} $program->{time}", $TZ); my ($start_base, $start_tz) = @{date_to_local($start, $TZ)}; $prog->{start}=UnixDate($start_base, '%q') . " $start_tz"; if (defined($program->{enddate}) && defined($program->{endtime})) { my $stop=parse_local_date("$program->{enddate} $program->{endtime}", $TZ); my ($stop_base, $stop_tz) = @{date_to_local($stop, $TZ)}; $prog->{stop}=UnixDate($stop_base, '%q') . " $stop_tz"; } else { t "--- missing enddate + endtime - ch: ".$prog->{channel}.", title: ".$program->{title}; } $prog->{desc} = [[ $program->{desc}, $lang ]] if defined $program->{desc}; $prog->{category} = $program->{category} if (defined $program->{category}); $prog->{date} = $program->{date} if defined $program->{date}; $prog->{qw(episode-num)} = $program->{qw(episode-num)} if defined $program->{qw(episode-num)}; $prog->{length} = $program->{length} if defined $program->{length}; $prog->{rating} = $program->{rating} if defined $program->{rating}; # support #202 # Setting stop date for the previous program # Last program in the grabbed list has no stop attribute, sorry. # Port.hu uses a virtual program as the last program # anyway if ((defined($prev)) && ((!defined($prev->{stop})) || $prev->{stop} ne $prog->{start})) { if ($prev->{start} ne $prog->{start}) { $prev->{stop} = $prog->{start}; } else { t "--- remove previous program: ".d @r; pop(@r); } } worker("base-parsing"); push @r, $prog; # if ((defined($prev)) && $prev->{start} eq $prog->{start}) { # starttime of previous and current programme is equal, # therefore use clumpidx to express relation # my $clumps_num = 2; # if (defined($r[-2]->{q(clumpidx)})) { # previous programme already has a clumpidx arg assigned. # ($clumps_num) = $r[-2]->{q(clumpidx)} =~ m|^\d+/(\d+)$|; # } # okay, assign new clumpidx values ... # for (0 .. ($clumps_num-1)) { # $r[-$clumps_num+$_]->{q(clumpidx)} = "$_/$clumps_num"; # } # } $prev = $prog; } return @r; } #------------------------------------------------------------------------------- # parse_short_desc #------------------------------------------------------------------------------- # desc : parse the short description of a program, founded on the program # listing page (this is mostly 1-2 lines ~ 120 characters), but # sometimes contains categrory, date, length # arguments : 1- reference to a program HASH, there is the grabbed description in it # and there should be attached the other newly found informations, # such as: # ( category => [ [Animals, en], [Természet, hu], [..], ... ] # date => 2001 ) # returns : none #------------------------------------------------------------------------------- sub parse_short_desc ($) { my $prog = shift; my (%result, $desc, $cont, $episode, $minutes, $year, @categories); if ((defined $prog->{desc}) && length($prog->{desc})) { $desc = $prog->{desc}; } else { return } # 1: if there is () in the desc grab from there # 2: if no () found, try in the first 120 character # # examples: # Hegylako - A hollo (amerikai-francia-kanadai kalandfilmsorozat, 1998) # Lisa. Animációs sorozat. # Slayers - A kis boszorkány. (12). Japan animacios sorozat. # # sometimes only the proposed minimal age of watching person is # presented in parentheses eg: (12), so parse this only if it is # longer as for example 6 (4 is not enough, because (ism.) is no category...) if (! (($cont) = $desc =~ m/[^\(]*\(([^\)]{6,})\)/)) { $cont = substr($desc, 0, (length($desc) < 120 ? length($desc) : 120)); } t "parse_short_desc: text: '$cont'"; $WORDS{$COUNTRY}->{episode}="zdontmatchz" unless exists $WORDS{$COUNTRY}->{episode}; t "episode (country: $COUNTRY): ".(defined($WORDS{$COUNTRY}->{episode}) ? $WORDS{$COUNTRY}->{episode} : "undef"); # port.hu episode style with season (# patch #80) if ($cont =~ /\s*([IVX]+\.?\s?\/\s?[0-9]+)\. $WORDS{$COUNTRY}->{episode}/) { $episode = $1; } # port.hu episode style without season elsif ($cont =~ /\s*([0-9\/]+)\. $WORDS{$COUNTRY}->{episode}/) { $episode = $1; } # port.ro episode style with season (# patch #74) elsif ($cont =~ /$WORDS{$COUNTRY}->{episode} \s*([0-9]+, [A-Za-z]+ [0-9])/) { $episode = $1; } # port.cz/.sk episode style with season elsif ($cont =~ /$WORDS{$COUNTRY}->{episode} \s*([0-9]+, [IVX]+)\./) { $episode = $1; } # port.cz/.sk episode style for two episodes back to back in one slot elsif ($cont =~ /$WORDS{$COUNTRY}->{episode} \s*(\d+, \d+)/) { $episode = $1; } # port.cz/.sk/.ro episode style without season elsif ($cont =~ /$WORDS{$COUNTRY}->{episode} \s*([0-9\/]+)/) { $episode = $1; } if ($cont =~ /\s*(\d+)'/) { $minutes = $1; } if ($cont =~ /\(.*?((?:19|20)[0-9]{2})/) { $year = $1; } # bug #448 elsif ($cont =~ /$WORDS{$COUNTRY}->{episode},\s((?:19|20)[0-9]{2}),\s/) { $year = $1; } # ex.: '... II / 4. rész, 2016, ...' t "found episode: '$episode'" if defined $episode; t "found minutes: '$minutes'" if defined $minutes; t "found year: '$year'" if defined $year; # Sort the category keys so they appear in consistent order foreach (sort keys %CATMAP) { next unless defined $CATMAP{$_}; # bug #443 if ($cont =~ /$CATMAP{$_}[0]/i) { push @categories, [$_, "en"]; push @categories, [$CATMAP{$_}[1], $COUNTRY]; t "found category: '$_'"; } } $prog->{q(category)} = \@categories if @categories; $prog->{q(length)} = $minutes * 60 if defined $minutes; $prog->{q(date)} = $year if defined $year ; $prog->{q(episode-num)} = extract_episode( $episode ) if defined $episode ; } #------------------------------------------------------------------------------- # get_nice_gzip #------------------------------------------------------------------------------- # desc : get url with get_nice, check for gzip encoding, return # decoded if necessary # arguments : url to fetch from # returns : html as scalar #------------------------------------------------------------------------------- sub get_nice_gzip( $ ) { my $data = get_nice($_[0]); # Use heuristics to check for valid html if ($data =~ / # ( 'display-name' => [ [ $channel_name, $COUNTRY ] ], # 'id' => "$channel_id.$d", # 'icon' => [ { src => $iconurl } ] ) #------------------------------------------------------------------------------- sub get_channels( ;$ ) { my $mode = shift; my $d = domain(); my $bar = new XMLTV::ProgressBar('getting list of channels', 1) if not $opt_quiet; my $url="https://www.$d/pls/tv/".($COUNTRY eq 'hu' || $COUNTRY eq 'ro' ? 'old' : '')."tv.prog"; # bug #501 worker("base-downloading"); t "fetching $url..."; $XMLTV::Get_nice::FailOnError = 1; my $data = get_nice_gzip($url); my $tree = HTML::TreeBuilder->new_from_content($data) or die "could not fetch/parse $url (channel listing)"; worker("base-parsing"); my @menus = $tree->find_by_tag_name("_tag"=>"select"); foreach my $elem (@menus) { my $cname = $elem->attr('name'); $cname = '' if (!$cname); if ($cname eq "i_ch") { my @ocanals = $elem->find_by_tag_name("_tag"=>"option"); @ocanals = sort @ocanals; foreach my $opt (@ocanals) { my %channel; if (not $opt->attr('value') eq "") { my $channel_id = $opt->attr('value'); my $channel_name = $opt->as_text; if (length $channel_id eq 1) { $channel_id = "00" . $channel_id } if (length $channel_id eq 2) { $channel_id = "0" . $channel_id } # Assume country code and lang. code the same. %channel = ( 'display-name' => [ [ $channel_name, $COUNTRY ] ], 'id' => "$channel_id.$d" ) ; if (!defined $mode || $mode ne 'grab') { # no point doing this for 'grab' # fetch and get icon url worker("base-downloading"); if (my $iconurl = grab_icon( $channel_id )) { $channel{'icon'} = [ { src => $iconurl } ]; } worker("base-parsing"); } $CHANNELS{$channel_id} = \%channel; } } } } die "no CHANNELS could be found" if not %CHANNELS; update $bar if not $opt_quiet; $bar->finish() if not $opt_quiet; t "CHANNELS:" . d \%CHANNELS; } #------------------------------------------------------------------------------- # get_channels_json #------------------------------------------------------------------------------- # desc : get channel listing for a country # arguments : none # returns : sets global CHANNELS hash to the grabbed channels: # ( '$channel_id' => # ( 'display-name' => [ [ $channel_name, $COUNTRY ] ], # 'id' => "$channel_id.$d", # 'icon' => [ { src => $iconurl } ] ) #------------------------------------------------------------------------------- sub get_channels_json( ;$ ) { my $mode = shift; my $d = domain(); my $bar = new XMLTV::ProgressBar('getting list of channels', 1) if not $opt_quiet; my $url="https://www.$d/".(($COUNTRY eq 'hu') ? "tvapi/init" : "pls/w/tv_api.init?i_page_id=1"); worker("base-downloading"); t "fetching $url..."; $XMLTV::Get_nice::FailOnError = 1; binmode(STDOUT,":encoding(UTF-8)"); my $data = get_nice($url); my $json_data = (($DEFAULT_ENCODING !~ /utf\-?8/i) && ($COUNTRY eq 'hu')) ? JSON->new->utf8(0)->decode(encode($DEFAULT_ENCODING,decode('utf-8', $data))) : JSON->new->utf8(0)->decode($data) or die "could not fetch/parse $url (channel listing)"; worker("base-parsing"); foreach my $ch (@{$json_data->{'channels'}}) { my $channel_id = $ch->{'id'}; $channel_id =~ s/^[^0-9]+//; # 'tvchannel-N' -> 'N' $channel_id = sprintf("%03d", $channel_id); # 'N' -> '00N' my @urls = (($COUNTRY eq 'hu') ? 'https://'.domain() : '').$ch->{'link'}; my %channel = ( 'display-name' => [ [ $ch->{'name'}, $COUNTRY ] ], 'id' => "$channel_id.$d", 'url' => \@urls ); if ($ch->{'logo'}) { $channel{'icon'} = [ { src => $ch->{'logo'} } ]; } $CHANNELS{$channel_id} = \%channel; } die "no CHANNELS could be found" if not %CHANNELS; update $bar if not $opt_quiet; $bar->finish() if not $opt_quiet; t "CHANNELS:" . d \%CHANNELS; } #------------------------------------------------------------------------------- # add_person #------------------------------------------------------------------------------- # desc : check and maybe add the person to the credits # arguments : 1- found hungarian/roumanian jobname on the HTML page # 2- name of the person # 3- reference to the global creadits hash # returns : none #------------------------------------------------------------------------------- sub add_person ( $$$ ) { my ($job, $person, $rcredits) = @_; $person =~ s/\s+/ /g; return unless length($person); # suppress if job is not known, or if not mapped to DTD if (defined $JOBMAP{$job} && length($JOBMAP{$job})) { push @{$$rcredits{$JOBMAP{$job}}}, $person; t "credits: added: '$job -> $person'"; } else { t "credits: NOT added: '$job -> $person'"; } } #------------------------------------------------------------------------------- # get_infourl_data #------------------------------------------------------------------------------- # desc : merge data from linked info page into programme hash # arguments : 1- reference to the program, whom detailed descr should be grabbed # 2- url to fetch # returns : none #------------------------------------------------------------------------------- sub get_infourl_data( $$ ) { my $prog = shift; my $d = domain(); my $url = shift; # add port.hu/port.ro base url only if url is not contains the "://" uri separator if (! ($url =~ "://")) { $url = "https://www.$d" . $url; } # no info, so don't add it to anywhere # -> calendar.event_popup if ($url =~ "calendar\.event_popup") { t "SKIP fetching of slow url: $url"; return; } # do not grab: # -> pictures: ... pls/me/picture.popup?i_area_id # -> dvd rent links page: ... pls/w/logging.page_log?i_page_id=20... # -> sample movie ... video.link_popup?i_object_id=18822 # -> dvd sales page: www.divido.hu... # -> bet on a sport event -> sprotingbet # -> general advert links: adverticum if ($url =~ "(picture.popup|logging.page_log|video.link_popup|www\.divido\.hu|sportingbet|adverticum\.net)") { # add this url to the program push @{$prog->{q(url)}}, $url; t "SKIP fetching of slow url: $url"; return; } t "fetching slow url" . d $url; worker("slow-downloading"); t "fetching $url..."; $XMLTV::Get_nice::FailOnError = 0; my $data; if (! defined($data = get_nice($url))) { worker("slow-parsing"); warn "Could not get URL: $url, the detailed description for the program [" . $prog->{channel} . ", " . $prog->{title}[0][0] . ", " . $prog->{start} . "] will be not available. Error message: " . error_msg($url) . "." ; return; } my $tree = HTML::TreeBuilder->new_from_content($data) or die "could not fetch/parse $url (infopage)"; worker("slow-parsing"); my (@lines, $line, $anchor, $left, $right, $parent, $elem, $joined); # SUBTITLE # anchor point: # the title will be tagged aw follows: # title # these siblings buld the subtitle, until a table follows... e.g.: #
... subtitle line 1 ...
# ...subtitle line 2 ...

# look_down(_tag=>"span", class=>"blackbigtitle"); ($anchor) = $tree->look_down(_tag=>"h1", class=>"blackbigtitle") if !defined $anchor; # 2014-04-09 it seems to now be in

if ($anchor) { $elem = $anchor; my ($engtitle, @tmp); while (($elem = $elem->right()) && ((ref $elem) && ($elem->tag() ne "table"))) { # if a whole line is surrounded with parentheses, on port.hu # this is the program's english title, add this as title and # also as subtitle: because some viewer shows only infos of # selected language (so the english title will be not # visible otherwise just in sub-title) @tmp = get_all_text($elem); push @lines, @tmp; $line = join(' ', @tmp); if (($engtitle) = $line =~ m/^\s*\(([^\)]+)\)\s*$/) { push @{$prog->{q(title)}}, [$engtitle, 'en']; t "engtitle added: $engtitle"; } } $joined = join(", ", @lines); $joined =~ s/\xA0//; # remove the to_text()'s results of   $joined =~ s/^\s+//; # remove blanks $joined =~ s/\s+$//; # remove blanks $joined =~ s/,$//; # remove trailing comma t "anchor and right sibling found, joinedlines parsed :'$joined'"; $prog->{q(sub-title)} = [[$joined, $COUNTRY]] if length($joined); } # ICON # anchor point: # the programme image will be tagged as follows: #
Închisoarea îngerilor - Tim Robbins t "programme icon parsing ..."; $anchor = $tree->look_down(_tag=>"div", class=>"random-media-wrapper"); $anchor = $anchor->look_down(_tag=>"img", class=>"object_picture") if $anchor; if ($anchor) { my %icon; $icon{'src'} = $anchor->attr('src') if $anchor->attr('src'); $icon{'width'} = $anchor->attr('width') if $anchor->attr('width'); $icon{'height'} = $anchor->attr('height') if $anchor->attr('height'); $prog->{q(icon)} = [ \%icon ] if $anchor->attr('src'); } # LINKS: # try to grab IMDB, All Movie, official web site of the program # anchor point: # (the Links are listed between dots, but it is not allways the 5th) # (dots block, because not all blocks presented allways, so this is not) # (suggested to use) the Links are listed after the text Linkek(hu) # or Linkuri(ro), some line # after come a 'dots' (which is in a TABLE element, so: # find a span element with Linkek(hu) or Linkuri(ro) contents, get all # A element until TABLE not reached t "links parsing ..."; $anchor = undef; my @spans = $tree->look_down(_tag => "span"); t "spans found: " . $#spans; foreach (@spans) { if ($_->as_text() =~ /$WORDS{$COUNTRY}->{links}/) { t "anchor point found"; $anchor = $_; last; } } my @links; if ($anchor) { $elem = $anchor; while (($elem = $elem->right()) && ((ref $elem) && ($elem->tag() ne "table"))) { foreach ($elem->find_by_tag_name("_tag"=>"a")) { # is this not begins with 'https?://' add prefix push @links, ($_->attr(q(href)) =~ /^https?:\/\// ? "" : "http://www.$d") . $_->attr(q(href)); t "link url added: " . $_->attr(q(href)) ; } } } push @links, $url; if (defined $prog->{q(url)}) { @{$prog->{q(url)}} = ( @links, @{$prog->{q(url)}} ); } else { push @{$prog->{q(url)}}, @links; } # LONG DESCRIPTION: # new format uses the
block # to separate contents # anchor point: # long desc is in the 3. block; this is right sibling of the 3rd separator # the actual content is inbetween the .... elements t "long desc parsing ..."; my @separators = $tree->look_down(_tag=>"div", class=>"separator"); return if ($#separators < 2); @lines = (); if (($anchor) = $separators[2]->right()) { $joined = $anchor->tag(); if ($anchor->tag() eq "span" && $anchor->attr('class') eq "txt") { push @lines, get_all_text($anchor); $joined = join(" ", @lines); $joined =~ s/\xA0//; # remove the to_text()'s results of   $joined =~ s/^\s+//; # remove blanks t "found description: $joined"; if (length($joined)) { delete($prog->{q(desc)}); # strip the desc at the specified command line option (if spec) if (defined ($opt_max_desc_length) && ($opt_max_desc_length < length($joined))) { t "long desc was stripped, at: $opt_max_desc_length."; $joined = substr($joined, 0, $opt_max_desc_length - 3) . "..."; } $prog->{q(desc)} = [[ $joined, $COUNTRY ]] } } } # SERIES NUMBER, CATEGORY, YEAR # anchor point: 2nd separator # all text data is in/under the parent TD element of the 2nd separator # We collect all text data, and parse it from known datas. return if ($#separators < 1); ($anchor) = $separators[1]; if ($anchor->parent()->tag() ne "td" || !defined $anchor->parent->attr('width') || $anchor->parent->attr('width') ne "98%") { # bug #445 t "credits section not found"; return; } # get the rating if available # (AP) if (my $img = $anchor->parent()->look_down('_tag' => 'img', 'class' => 'age_limit_icon')) { my $rating = $img->attr('alt'); $rating =~ s/[\(\)]//g; # strip the brackets my $rating_icon = $img->attr('src'); $prog->{q(rating)} = [[ $rating, '', [{'src' => $rating_icon }] ]]; } # unfortunately the star-rating (vote_box) if fetched with an AJAX call # e.g. http://www.port.sk/arrow/pls/fi/vote.print_vote_box?i_object_id=139692&i_area_id=6&i_reload_container=id%3D%22vote_box%22&i_is_separator=0 # # we could use TreeBuilder->store_cdata(true) to store the cdata under the root node, but I think it's easier to just regexp the html # /*new_from_content($ajaxdata) or die "could not fetch/parse $ajaxurl (infopage)"; worker("ajax-parsing"); # get the "star-rating" # The rating includes the number of votes. Testing for statistical significance calculates that, at # a confidence level of .95, a population of 100 will give 70% confidence in the score being accurate # (and is largely independent of population size). # Therefore only output the rating where the population is > 100. # if (my $anchor = $ajaxtree->look_down(_tag=>"div", class=>qr/starholder/)) { my $starsval; if (my $stars = $anchor->look_down(_tag=>"span", class=>"ctxt")) { $starsval = $stars->as_text(); $starsval =~ s/,/./; $starsval += 0; # convert to float } if (my $votes = $anchor->look_down(_tag=>"div", class=>"votenum")) { my $votesval = $votes->as_text(); if ($votesval) { (my $num_votes) = $votesval =~ /(\d*)/; # if number of votes is >100 then output the star-rating if ($num_votes ne '' && int($num_votes) > 100) { $prog->{q(star-rating)} = [[ sprintf("%.0f / 10", $starsval), "uservotes" ]]; } } } } } # restore the previous values $XMLTV::Get_nice::FailOnError = $get_fail; $XMLTV::Get_nice::Delay = $get_delay; } # collect all text lines, we # achive this to jump to the parent first, and walk all the childs until # the anchor is reached @lines = (); foreach $elem ($anchor->parent()->content_list()) { last if ((ref $elem) && ($elem == $anchor)); push @lines, get_all_text($elem); } # 0:{we are in credits secton}, 1:{duration,year section} my $section = 0; my $job = "foobar"; my $part = ""; my (%credits, $episode, $minutes, $year); my $person = ""; foreach $line (@lines) { $line =~ s/\xA0//; # remove to_text()'s results of &npsp t "processing line: '" . d $line . "'"; foreach $part (split /, */, $line) { $part =~ s/^\s+//; # remove heading blanks $part =~ s/\s+$//; # remove ending blanks $part =~ s/^,*$//; next unless length $part; t "processing part: '$part'"; # we are in credits block if a known hungarian "job:" found $section = 1 if (($section == 0) && (($_) = $part =~ m/\b(.+):/) && (defined($JOBMAP{$_}))); if ($section == 0) { # duration, year, category # possibilitys # 1: amerikai filmdráma sorozat, 90 perc, 2000, 2. rész # 12 éven aluliak számára .... # added 2004-04-07 : # (ro) Coreea de Sud, 2009, serial de aventuri, episodul 5 $_ = $part; SWITCH: { if ((m/\s*([0-9\/]+)\. $WORDS{$COUNTRY}->{episode}/) && (! defined $episode)) { $episode = $1; last SWITCH;} if ((m/$WORDS{$COUNTRY}->{episode} \s*([0-9\/]+)/) && (! defined $episode)) { $episode = $1; last SWITCH;} if ((m/\s*(\d+) $WORDS{$COUNTRY}->{minute}/) && (! defined $minutes)) { $minutes = $1; last SWITCH;} if ((m/\s*([12][0-9]{3})/) && (! defined $year)) { $year = $1; last SWITCH;} { ; } # default -> category, was processed over } t "found episode: '$episode'" if defined $episode; t "found minutes: '$minutes'" if defined $minutes; t "found year: '$year'" if defined $year; } # section 0 if ($section == 1) { # # is there a "hu-job:" string in the part? if yes, we should # push the last readed person, and clear the person string. # if a job is defined (hu-job) but not supported in the DTD # we will add # the person(s) as: # some_job: Foo Bar, Dummy Name, ... # note: \b(.+): do not match to " író: ", because í is not # part of \b # bug #451 [line deleted] if (($_) = $part =~ /^\s*(\S+):/) { # does the $line include a ':' # remove the "jobname:" string $part =~ s/^\s*(\S+):\s*//; t "assuming string is a jobname"; # this means, we should add our until now collected # person to the credits, and begin to collect new # actors... add_person($job, $person, \%credits); t "is this a known job?: '$_'"; # e.g.: hu-job if (defined $JOBMAP{lc($_)} && length($JOBMAP{lc($_)})) { # newly readed part has a en-job (this is defined in DTD, so # this will be the next used job for XML generation t "job known in DTD as: $JOBMAP{lc($_)}"; $job = lc($_); $person = $part; } #en-job else { # this job is not known in DTD, so only en-job, no hu-job; # add as descriped above, set job to foobar to add as actor $job = "foobar"; $person = "$_: $part "; } #hu-job next; } #: in the part # we are here, if: # -> $part holds ':' but it is no hu-job (no en-job) # -> it have no : if ($part =~ /^\(.*\)$/) { t "found () expression, addint it to person string"; # if it has the from '(...)' the found HTML was: # actor: Arnold Schweizenegger (as the Terminator) # add this to persons and do not push, it. $person .= " $part"; } else { # this is a new name, check how looks person, if it ends # with ":" do not add this to credits, only append, because in the # previuos iteration only hu-job was found. if ($person =~ /:\s*$/) { $person .= " $part"; } else { add_person($job, $person, \%credits); $person = $part; } } } #section 1 } #loop over parts } #loop over $lines # add the last processed data to credits... add_person($job, $person, \%credits) if length($person); t "CREDITS: " . d \%credits; $prog->{q(credits)} = \%credits; #$prog->{q(category)} = [[ $category, $COUNTRY ]] #if defined $category and length $category; $prog->{q(length)} = $minutes * 60 if defined $minutes; $prog->{q(date)} = $year if defined $year ; $prog->{q(episode-num)} = extract_episode( $episode ) if defined $episode ; $tree->delete; } #------------------------------------------------------------------------------- # get_infourl_data_json #------------------------------------------------------------------------------- # desc : merge data from linked info page into programme hash # arguments : 1- reference to the program, whom detailed descr should be grabbed # 2- url to fetch # returns : none #------------------------------------------------------------------------------- sub get_infourl_data_json( $$ ) { my $prog = shift; my $d = domain(); my $url = shift; # add port.hu/port.ro base url only if url is not contains the "://" uri separator if (! ($url =~ "://")) { $url = "https://www.$d" . $url; } # no info, so don't add it to anywhere # -> calendar.event_popup if ($url =~ "calendar\.event_popup") { t "SKIP fetching of slow url: $url"; return; } # do not grab: # -> pictures: ... pls/me/picture.popup?i_area_id # -> dvd rent links page: ... pls/w/logging.page_log?i_page_id=20... # -> sample movie ... video.link_popup?i_object_id=18822 # -> dvd sales page: www.divido.hu... # -> bet on a sport event -> sprotingbet # -> general advert links: adverticum if ($url =~ "(picture.popup|logging.page_log|video.link_popup|www\.divido\.hu|sportingbet|adverticum\.net)") { # add this url to the program push @{$prog->{q(url)}}, $url; t "SKIP fetching of slow url: $url"; return; } t "fetching slow url" . d $url; worker("slow-downloading"); t "fetching $url..."; $XMLTV::Get_nice::FailOnError = 0; my $data; if (! defined($data = get_nice($url))) { worker("slow-parsing"); warn "Could not get URL: $url, the detailed description for the program [" . $prog->{channel} . ", " . $prog->{title} . ", " . $prog->{start} . "] will be not available. Error message: " . error_msg($url) . "." ; return; } else { if ($data =~ //) { my $orig_title = substr($data, index($data, '')+14, index($data, '', index($data, ''))-(index($data, '')+14)); $orig_title =~ s/<[^>]+>//g; $orig_title =~ s/^\s+|\s+$//g; # trim spaces $orig_title =~ s/^[^\n]+\n//g; # remove translated title $orig_title =~ s/^[^\/]+\/(.*)\/$/($1)/g; $orig_title = encode($DEFAULT_ENCODING, decode('utf-8', $orig_title)) if ($DEFAULT_ENCODING !~ /utf\-?8/i); $prog->{q(desc)} = (defined($prog->{q(desc)}) && $prog->{q(desc)} ne "") ? $prog->{q(desc)}.' '.$orig_title : $orig_title; } if ($data =~ //) { my $sum = substr($data, index($data, '')+16, index($data, '', index($data, ''))-(index($data, '')+16)); $sum =~ s/<[^>]+>//g; $sum =~ s/^\s+|\s+$//g; # trim spaces $sum =~ s/magyarul\ besz..l..\,\ //g; $sum = encode($DEFAULT_ENCODING, decode('utf-8', $sum)) if ($DEFAULT_ENCODING !~ /utf\-?8/i); $prog->{q(desc)} = (defined($prog->{q(desc)}) && $prog->{q(desc)} ne "") ? $prog->{q(desc)}.' '.$sum : $sum; } if ($data =~ //) { $data = substr($data, index($data, '
')+25, index($data, '
', index($data, '
'))-(index($data, '
')+25)); $data =~ s/<\/?article>//ig; $data =~ s// /ig; $data =~ s/[^\<]+<\/strong>//ig; # Feliratozva a teletext ... $data =~ s/.*//ig; # Forgalmazó: ... / Bemutató dátuma: ... $data =~ s/^\s+|\s+$//g; # trim spaces $data = encode($DEFAULT_ENCODING, decode('utf-8', $data)) if ($DEFAULT_ENCODING !~ /utf\-?8/i); $prog->{q(desc)} = (defined($prog->{q(desc)}) && $prog->{q(desc)} ne "") ? $prog->{q(desc)}.', '.$data : $data; } worker("slow-parsing"); } } #------------------------------------------------------------------------------- # extract_episode #------------------------------------------------------------------------------- # desc : parse text containing the episode details # arguments : 1- episode data # returns : xmltv episode-num definition #------------------------------------------------------------------------------- sub extract_episode( $ ) { my $episode = shift; my ($episode_num, $season); if(defined($episode)) { if($episode =~ m#(\d+)/(\d+)#) { # episode-num spec with the total number specified. # swap numbers for port.hu, they have total/num my ($num, $total) = ($1, $2); ($num, $total) = ($2, $1) if ($num > $total); # however XMLTV counts from 0 on ... $episode_num = [[ sprintf('. %d/%d .', $num - 1, $total), "xmltv_ns" ], [ $episode, "onscreen" ]]; } elsif($episode =~ m#([IVX]+)\./(\d+)#) { # patch #80 # port.hu style episode numbering: ./. e.g. V./3 # episode-num spec with the total number specified. # decode season from roman numeral $season = arabic ($1); # however XMLTV counts from 0 on ... $episode_num = [[ sprintf('%d . %d .', $season - 1, $2 - 1), "xmltv_ns" ], [ $episode, "onscreen" ]]; } elsif($episode =~ m#(\d+), ([IVX]+)#) { # episode-num spec with the total number specified. # decode season from roman numeral $season = arabic ($2); # however XMLTV counts from 0 on ... $episode_num = [[ sprintf('%d . %d .', $season - 1, $1 - 1), "xmltv_ns" ], [ $episode, "onscreen" ]]; } elsif($episode =~ m#(\d+)#) { # episode-num spec with just the episode number # however XMLTV counts from 0 on ... $episode_num = [[ sprintf('. %d .', $1 - 1), "xmltv_ns" ], [ $episode, "onscreen" ]]; } else { $episode_num = [[ $episode, "onscreen" ]]; } } return $episode_num; } #------------------------------------------------------------------------------- # grab_icon #------------------------------------------------------------------------------- # desc : fetch (if needed and specified) channel icons, returns pointing URL # arguments : 1- channel id (eg 003) # returns : url pointing to tha program's logo (icon) http:|file:... #------------------------------------------------------------------------------- sub grab_icon( $ ) { # if icon not requested return unless ($opt_icons || $opt_local_icons); my $channelid = shift; my $fetchurl = "https://www." . domain() . "/tv/kep_ado/al_".(int(${channelid}) % 10000).".gif"; my ($file, $iconurl); # that $fetchurl no longer works for RO, so... #test if url is valid $XMLTV::Get_nice::FailOnError = 0; my $image = get_nice($fetchurl); if (!defined $image) { # image url not valid, so we must get it from the programmes page. Ideally we would do that during the main grab but this is a Q&D fix # and I don't want to change too much of this code my $url = "https://www." . domain() . "/pls/w/".($COUNTRY eq 'hu' || $COUNTRY eq 'ro' ? 'old' : '')."tv.channel?i_ch=".$channelid."&i_date=".UnixDate('today','%Y-%m-%d')."&i_where=1"; # bug #501 my $data=get_nice($url); my $tree = HTML::TreeBuilder->new_from_content($data) or die "could not fetch/parse $url (grab_icon)\n"; worker("base-parsing"); my $body = $tree->look_down("_tag"=>"body"); my $container = $body->look_down("_tag" => "div", "class" => qr/main-container-100/); if ($container) { if (my $imgdiv = $container->look_down("_tag" => "div", "style" => qr/float\s*:\s*left/, sub { my $imgtag = $_[0]->look_down('_tag' => 'img'); return 0 unless $imgtag; return $imgtag->attr('src') =~ m/https:\/\/media/; } )) { $fetchurl = $imgdiv->look_down('_tag' => 'img')->attr('src'); } } } $XMLTV::Get_nice::FailOnError = 1; return $fetchurl if ($opt_icons && ! $opt_local_icons); # create directory mkdir $opt_local_icons unless (-d $opt_local_icons); # remove multiple /; make absoluth path $_ = "${opt_local_icons}/${channelid}.gif"; s!//!/!g; $file = Cwd::abs_path( $_ ); $iconurl = "file://${file}"; return $iconurl if ($opt_local_icons && $opt_no_fetch_icons); if (! -d $opt_local_icons) { warn "directory not exists, and cannot create: $opt_local_icons; " . "icon will be not grabbed"; return $fetchurl; } if (open(FILE,">$file")) { t "fetching $fetchurl..."; $XMLTV::Get_nice::FailOnError = 0; #if (my $image = get_nice($fetchurl)) { # now grabbed above if (!$image) { $image = get_nice($fetchurl); } if ($image) { t "icon for $channelid grabbed successfully"; print FILE $image; close FILE; # success return $iconurl; } else { warn "Could not download channel-logo for channel $channelid, using remote URL instead. " . "Error message: " . error_msg($fetchurl) . "."; close FILE; unlink $file; return $fetchurl; } } else { warn "cannot create icon file ($file) for channel $channelid, using remote URL instead"; close FILE; unlink $file; return $fetchurl; } return; } #------------------------------------------------------------------------------- # get_channel_urls #------------------------------------------------------------------------------- # desc : grab a channel page fetch (if needed and specified) channel icons, returns pointing URL # arguments : 1- channel id (eg 003) (grab a webpage parse data form there) # OR # 2- reference to a HTML tree's (root) object (searching in it) # returns : array of urls pointing to tha channel's pages/emails #------------------------------------------------------------------------------- sub get_channel_urls( $ ) { my $ch_did = shift; my @result = (); my $chdata; # two sprintf parameters: first: channel_id,, second how many days grabbed my $churlfmt = "https://www." . domain() . "/pls/tv/".($COUNTRY eq 'hu' || $COUNTRY eq 'ro' ? 'old' : '')."tv.channel?i_ch=%d&" . "i_days=1&i_xday=%d&i_where=1"; # bug #501 # url to grab now (4 days - this is the minimum) my $churl = sprintf($churlfmt, $ch_did, 4); # url to add as the information source (4 days - this is the minimum) my $portchurl = sprintf($churlfmt, $ch_did, 4); t "fetching page for channel urls: $churl\n"; worker("base-downloading"); t "fetching $churl..."; $XMLTV::Get_nice::FailOnError = 0; if (! defined($chdata = get_nice($churl))) { worker("base-parsing"); warn "Could not get URL: $churl, the information urls for the channel $ch_did will be not available. " . "Error message: " . error_msg($churl) . "."; push @result, $portchurl; return @result; } my $tree = HTML::TreeBuilder->new_from_content($chdata) or die "could not fetch/parse $churl (channel infopage)"; worker("base-parsing"); my ($anchor, $elem); # we have to way to find the channel URLs: # -> find the channel image (this is in the same TABLE element as the # requested A elements, and if this is not found: # -> try to find a HR element (only one is presented on the page), this # Nth left sibling is the searched TABLE. if (($anchor) = $tree->look_down( _tag => "b", sub { lc($_[0]->as_text()) =~ /web:/ } ) ) { if ($anchor = ($anchor->look_up("_tag"=>"p")->look_down("_tag"=>"a"))) { push @result, $anchor->attr(q(href)); } else { t "channel url not found"; } } else { t "channel url not found"; } # add PORT url, too, this should be the last (and open 3 days if clicked) push @result, $portchurl if defined $portchurl; return @result; } #------------------------------------------------------------------------------- # load_configs #------------------------------------------------------------------------------- # desc : load the tv_grab_huro.conf, jobmap, catmap.$COUNTRY files, and # sets the globals: %CATMAP, %JOBMAP # arguments : none # returns : array of port channel ids: ( 001, 005 ) #------------------------------------------------------------------------------- sub load_configs() { my @config_lines = XMLTV::Config_file::read_lines($CONFIG_FILE); my $line_num = 0; my (@portids, $where, @fields); foreach (@config_lines) { ++ $line_num; next if not defined; $where = "$CONFIG_FILE:$line_num"; if (/^country:?\s+(\w\w)$/) { warn "$where: already seen country\n" if defined $COUNTRY; $COUNTRY = $1; } elsif (/^channel:?\s+(\S+)\s+([^\#]+)/) { my $ch_did = $1; my $ch_name = $2; $ch_name =~ s/\s*$//; push @portids, $ch_did; # FIXME do not store display-name in the config file - it is # ignored here. } else { warn "$CONFIG_FILE:$.: bad line\n"; } } for ($COUNTRY) { if (not defined) { $_ = 'hu'; warn "country not seen in $CONFIG_FILE, assuming '$_'\n"; } } # Lame reverse lookup on %COUNTRIES. foreach (values %COUNTRIES) { if ($_->[0] eq $COUNTRY) { $TZ = $_->[1]; last; } } die "$where: unknown country $COUNTRY\n" if not defined $TZ; # jobmap file # (this is a file, where we store translations of job names from # Hungarian or Romanian language to English. However we leave some # translations blank, namely these that have no field in the credits # structure) # # Read the file with channel mappings. my $jobmap_file = "jobmap"; my $jobmap_str = GetSupplement( 'tv_grab_huro', $jobmap_file ); $line_num = 0; foreach (split( /\n/, $jobmap_str )) { ++ $line_num; tr/\r//d; s/#.*//; next if m/^\s*$/; s/^\s+|\s+$//g; # trim spaces $where = "$jobmap_file:$line_num"; @fields = split m/:/; die "$where: wrong number of fields" if @fields > 2; my ($huro_job, $credits_id) = @fields; $JOBMAP{$huro_job} = defined($credits_id) ? $credits_id : ""; } # read the file with category mappings. # cat_en:cat_hu:regexp my $catmap_file = "catmap.$COUNTRY"; my $catmap_str=""; $catmap_str = GetSupplement( 'tv_grab_huro', $catmap_file ); $line_num = 0; foreach (split( /\n/, $catmap_str )) { ++ $line_num; tr/\r//d; s/#.*//; next if m/^\s*$/; s/^\s+|\s+$//g; # trim spaces $where = "$catmap_file:$line_num"; @fields = split m/:/; die "$where: wrong number of fields" if @fields > 3; my ($cat_en, $cat_hu, $cat_reg) = @fields; $CATMAP{$cat_en} = defined($cat_reg) ? [$cat_reg, $cat_hu] : [$cat_hu, $cat_hu]; } return @portids; } #------------------------------------------------------------------------------- # worker #------------------------------------------------------------------------------- # desc : measure how many seconds will be executed some port of this program # arguments : 1- name of the worker part of this program, currently: # xml-writing, base-downloading, slow-downloading # base-parsing, slow-parsing # returns : none #------------------------------------------------------------------------------- sub worker( $ ) { my $now = time(); my $newworker = shift; if (! defined $WNAME) { $WNAME = $newworker; $WTIMES{$WNAME} = 0; $WSTIME = $now; return; } $WTIMES{$WNAME} += $now - $WSTIME; $WSTIME = $now; $WNAME = $newworker; } #------------------------------------------------------------------------------- # showworkers #------------------------------------------------------------------------------- # desc : prints $WTIMES to the stdout # arguments : none # returns : none #------------------------------------------------------------------------------- sub showworkers() { return if $opt_quiet; return if not $opt_worker_times; my $total = 0; $total += $_ foreach values %WTIMES; $total = 1 unless $total ; # division by zero printf STDERR ("%-20s: %3d:%02dm %3d%%\n", $_, $WTIMES{$_} / 60, $WTIMES{$_} % 60, 100 * $WTIMES{$_} / $total) foreach keys %WTIMES; printf STDERR ("%-20s: %3d:%02dm\n", "total", $total / 60, $total % 60); } #------------------------------------------------------------------------------- # arabic #------------------------------------------------------------------------------- # desc : This example uses Robin Houston's entry in the Perl # : Institute's Roman Numeral Challenge to convert Roman numerals # : to Arabic. http://www.perl.org/wits.html # : (we could just use Roman, but don't want to add the dependency) # arguments : roman number # returns : integer #------------------------------------------------------------------------------- sub arabic( $ ) { my ($n, $d); ($n, $d, $_) = (1, 2, @_); $_ = uc $_ if !/[^a-z]/; for my $v(split//, 'IVXLCDM') { s/\+.*$v/)/; s/$v([^$v+-])/-$n$1/g; s/$v/+$n/g; $n *= $d ^= 7 } /[^-+\d]/ ? () : eval } #------------------------------------------------------------------------------- # M A I N #------------------------------------------------------------------------------- # Whether zero-length programmes should be included in the output. my $WRITE_ZERO_LENGTH = 0; # Get options, including undocumented --cache option. XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); $opt_slow = 0; $opt_full_desc = 0; $opt_days = 8; # default $opt_offset = 0; # default $opt_quiet = 0; # default GetOptions( 'days=i' => \$opt_days, 'offset=i' => \$opt_offset, 'help' => \$opt_help, 'configure' => \$opt_configure, 'gui:s' => \$opt_gui, 'config-file=s' => \$opt_config_file, 'output=s' => \$opt_output, 'quiet' => \$opt_quiet, 'slow' => \$opt_slow, 'list-channels' => \$opt_list_channels, 'icons' => \$opt_icons, 'local-icons=s' => \$opt_local_icons, 'no-fetch-icons'=> \$opt_no_fetch_icons, 'loc=s' => \$opt_loc, 'now=s' => \$opt_now, 'worker-times' => \$opt_worker_times, 'get-full-description' => \$opt_full_desc, 'max-desc-length=i' => \$opt_max_desc_length ) or usage(0); die 'number of days must not be negative' if (defined $opt_days && $opt_days < 0); usage(1) if $opt_help; my $mode = XMLTV::Mode::mode('grab', # default $opt_configure => 'configure', $opt_list_channels => 'list-channels'); XMLTV::Ask::init($opt_gui); # File that stores which channels to download. $CONFIG_FILE = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_huro', $opt_quiet, 'tv_grab_hu'); #------------------------------------------------------------------------------- # only configuration #------------------------------------------------------------------------------- if ($mode eq 'configure') { worker("base-parsing"); XMLTV::Config_file::check_no_overwrite($CONFIG_FILE); open(CONF, ">$CONFIG_FILE") or die "cannot write to $CONFIG_FILE: $!"; my $default_cn = 'Hungary'; my $cn = ask_choice('Grab listings for which country?', $default_cn, sort keys %COUNTRIES); $COUNTRY = $COUNTRIES{$cn}[0]; print CONF "country $COUNTRY\t# $cn\n"; # Ask about each channel. (($COUNTRY) && (($COUNTRY eq 'hu') || ($COUNTRY eq 'ro'))) ? get_channels_json() : get_channels; # sets %CHANNELS my @portids = sort keys %CHANNELS; my @names = map { $CHANNELS{$_}->{qw(display-name)}->[0][0] } @portids; my @qs = map { "add channel $_?" } @names; my @want = ask_many_boolean(1, @qs); foreach (@portids) { my $w = shift @want; warn("cannot read input, stopping channel questions"), last if not defined $w; # No need to print to user - XMLTV::Ask is verbose enough. # Print a config line, but comment it out if channel not wanted. print CONF '#' if not $w; my $name = shift @names; print CONF "channel $_ $name\n"; # TODO don't store display-name in config file. } close CONF or warn "cannot close $CONFIG_FILE: $!"; say("Finished configuration."); worker("base-parsing"); showworkers(); exit(); } # Options to be used for XMLTV::Writer. my %w_args; $w_args{encoding} = $DEFAULT_ENCODING; if (defined $opt_output) { my $fh = new IO::File(">$opt_output"); die "cannot write to $opt_output: $!" if not defined $fh; $w_args{OUTPUT} = $fh; } #------------------------------------------------------------------------------- # only channel listing #------------------------------------------------------------------------------- if ($mode eq 'list-channels') { # Write channels mode. worker("base-parsing"); if (not defined $opt_loc) { my $msg = "--loc option required with --list-channels:\n"; foreach (sort keys %COUNTRIES) { $msg .= " --loc $COUNTRIES{$_}[0] for $_\n"; } die $msg; } $COUNTRY=$opt_loc; worker("xml-writing"); my $writer = new XMLTV::Writer(%w_args); $writer->start(xhead()); worker("base-parsing"); (($COUNTRY) && (($COUNTRY eq 'hu') || ($COUNTRY eq 'ro'))) ? get_channels_json() : get_channels(); # sets %CHANNELS # sort channels based on their portid my @portids = sort keys %CHANNELS; worker("xml-writing"); $writer->write_channel($CHANNELS{$_}) foreach @portids; $writer->end(); worker("base-parsing"); showworkers(); exit(); } #------------------------------------------------------------------------------- # only grabbing #------------------------------------------------------------------------------- if ($mode eq 'grab') { worker("base-parsing"); my $ch_did; my $bar; my @portids = load_configs(); # sets %CHANNELS (($COUNTRY) && (($COUNTRY eq 'hu') || ($COUNTRY eq 'ro'))) ? get_channels_json($mode) : get_channels($mode); worker("xml-writing"); my $writer = new XMLTV::Writer(%w_args); worker("base-parsing"); # we have to fetch @portids icons, and @portids pages for channel URL # (e.g.: www.hbo.hu) $bar = new XMLTV::ProgressBar('getting channel details ', 2 * @portids) if not $opt_quiet; worker("xml-writing"); $writer->start(xhead()); worker("base-parsing"); # Write channel elements foreach $ch_did (@portids) { if (! $CHANNELS{$ch_did}) { warn "\nWARNING: Channel with port-id $ch_did no more exists on the site, skipping it's channel description grabbing!"; next; } my %channel = %{$CHANNELS{$ch_did}}; worker("base-downloading"); # fetch and get icon url if (my $iconurl = grab_icon( $ch_did )) { $channel{'icon'} = [ { src => $iconurl } ]; } update $bar if not $opt_quiet; worker("base-parsing"); if (($COUNTRY) && ($COUNTRY ne 'hu') && ($COUNTRY ne 'ro') && (my @churls = get_channel_urls( $ch_did ))) { $channel{'url'} = \@churls; } update $bar if not $opt_quiet; worker("xml-writing"); $writer->write_channel(\%channel); worker("base-parsing"); } $bar->finish() if not $opt_quiet; if (!defined($COUNTRY) || (($COUNTRY ne 'hu') && ($COUNTRY ne 'ro'))) { # old, HTML based pages # The grabber's source allows requests of more than one day per page. This can # be done by specifying the i_xday argument with the GET request. # # To not load their server too much (requesting e.g. 14 channels in one shot # should 'cause quite some traffic to the SQL server) I think we shouldn't # query for more then 5 channels per page. With the default of requesting data # for 8 days this leads to 2 requests per channel and grab ... $DAYSPERPAGE = int($opt_days / 5) + (($opt_days % 5) ? 1 : 0); $DAYSPERPAGE = int($opt_days / $DAYSPERPAGE); # We have to request at minimum of four days $DAYSPERPAGE = 4 if ($DAYSPERPAGE<4); } else { # JSON $DAYSPERPAGE = 1; } t "requesting $DAYSPERPAGE days per scraped webpage ..."; # port.hu|ro provide the today's program based on the localtime on # Hungary. So in other lands e.g. Australia (thx Zsolt Bayer) (TZ: EST/AEST) if # there is f.e. friday 22:38 here in Hungary it is saturday 04:38 # so Zsolt will get the programs not for the requested day (the XML will be # correct, just the wrong day is in) # # we cannot use Date::Manip's Date_ConvTZ, because it does not detects # correctly f.e. the Australia/Melbourne zone. (because it uses `date +%Z` # to get the zone, and date will output EST and not AEST :-(). # [we could not use f.e. `date +%z`, becuase what happen on windows?] # # that means: we will here not set the global FETCHOFFSET to fetch # the "today's" program from everywhere on the world, but we will grab # at first 3 pages (0, -1, +1) to find the correct offset. my $now = parse_date("now"); # developer's options --now: what time is it? (measured in local time) $now = parse_date( $opt_now ) if ($opt_now); t "now=$now"; my $startat = DateCalc($now, "$opt_offset days"); my $startatdate = UnixDate($startat, '%Q'); t "start grabbing from (offset added, localtime): $startatdate"; # make list: which date is which day on the website, we will make grabbing # requests based on the @days array my @days; for (my $i = 1 + $opt_offset; $i < 1 + $opt_offset + $opt_days; $i += $DAYSPERPAGE) { push @days, [ $startatdate, $i ]; # calculate the next date: bump a YYYYMMDD date by $DAYSPERPAGE day $startatdate = UnixDate(DateCalc(parse_date($startatdate), "+ $DAYSPERPAGE days"), '%Q'); die "Could not calculate next grabbing date $days[$#days][0] (+$DAYSPERPAGE days)" if not defined $startatdate; } # This progress bar is for both downloading and parsing. Maybe # they could be separate stages. $bar = new XMLTV::ProgressBar('getting program listings', @days * @portids) if not $opt_quiet; foreach my $date_n_day (@days) { my ($idate, $iday) = @$date_n_day; my $some_success = 0; foreach $ch_did (@portids) { if (! $CHANNELS{$ch_did}) { warn "\nWARNING: Channel with port-id $ch_did no more exists on the site, skipping it's program grabbing!"; next; } my @ps = (($COUNTRY) && (($COUNTRY eq 'hu') || ($COUNTRY eq 'ro'))) ? process_json($idate, xid($ch_did), $ch_did, $iday) : process_table($idate, xid($ch_did), $ch_did, $iday); $some_success = 1 if @ps; worker("xml-writing"); $writer->write_programme($_) foreach @ps; worker("base-parsing"); update $bar if not $opt_quiet; } if (@portids and not $some_success) { warn "failed to get any listings for day $iday, stopping\n"; last; } } $bar->finish() if not $opt_quiet; worker("xml-writing"); $writer->end(); worker("base-parsing"); showworkers(); exit(0); } die;