t "programme icon parsing ..."; $anchor = $tree->look_down(_tag=>"div", class=>"random-media-wrapper"); $anchor = $anchor->look_down(_tag=>"img", class=>"object_picture") if $anchor; if ($anchor) { my %icon; $icon{'src'} = $anchor->attr('src') if $anchor->attr('src'); $icon{'width'} = $anchor->attr('width') if $anchor->attr('width'); $icon{'height'} = $anchor->attr('height') if $anchor->attr('height'); $prog->{q(icon)} = [ \%icon ] if $anchor->attr('src'); } # LINKS: # try to grab IMDB, All Movie, official web site of the program # anchor point: # (the Links are listed between dots, but it is not allways the 5th) # (dots block, because not all blocks presented allways, so this is not) # (suggested to use) the Links are listed after the text Linkek(hu) # or Linkuri(ro), some line # after come a 'dots' (which is in a TABLE element, so: # find a span element with Linkek(hu) or Linkuri(ro) contents, get all # A element until TABLE not reached t "links parsing ..."; $anchor = undef; my @spans = $tree->look_down(_tag => "span"); t "spans found: " . $#spans; foreach (@spans) { if ($_->as_text() =~ /$WORDS{$COUNTRY}->{links}/) { t "anchor point found"; $anchor = $_; last; } } my @links; if ($anchor) { $elem = $anchor; while (($elem = $elem->right()) && ((ref $elem) && ($elem->tag() ne "table"))) { foreach ($elem->find_by_tag_name("_tag"=>"a")) { # is this not begins with 'https?://' add prefix push @links, ($_->attr(q(href)) =~ /^https?:\/\// ? "" : "http://www.$d") . $_->attr(q(href)); t "link url added: " . $_->attr(q(href)) ; } } } push @links, $url; if (defined $prog->{q(url)}) { @{$prog->{q(url)}} = ( @links, @{$prog->{q(url)}} ); } else { push @{$prog->{q(url)}}, @links; } # LONG DESCRIPTION: # new format uses the

block # to separate contents # anchor point: # long desc is in the 3. block; this is right sibling of the 3rd separator # the actual content is inbetween the .... elements t "long desc parsing ..."; my @separators = $tree->look_down(_tag=>"div", class=>"separator"); return if ($#separators < 2); @lines = (); if (($anchor) = $separators[2]->right()) { $joined = $anchor->tag(); if ($anchor->tag() eq "span" && $anchor->attr('class') eq "txt") { push @lines, get_all_text($anchor); $joined = join(" ", @lines); $joined =~ s/\xA0//; # remove the to_text()'s results of $joined =~ s/^\s+//; # remove blanks t "found description: $joined"; if (length($joined)) { delete($prog->{q(desc)}); # strip the desc at the specified command line option (if spec) if (defined ($opt_max_desc_length) && ($opt_max_desc_length < length($joined))) { t "long desc was stripped, at: $opt_max_desc_length."; $joined = substr($joined, 0, $opt_max_desc_length - 3) . "..."; } $prog->{q(desc)} = [[ $joined, $COUNTRY ]] } } } # SERIES NUMBER, CATEGORY, YEAR # anchor point: 2nd separator # all text data is in/under the parent TD element of the 2nd separator # We collect all text data, and parse it from known datas. return if ($#separators < 1); ($anchor) = $separators[1]; if ($anchor->parent()->tag() ne "td" || !defined $anchor->parent->attr('width') || $anchor->parent->attr('width') ne "98%") { # bug #445 t "credits section not found"; return; } # get the rating if available # (AP)

if (my $img = $anchor->parent()->look_down('_tag' => 'img', 'class' => 'age_limit_icon')) { my $rating = $img->attr('alt'); $rating =~ s/[]//g; # strip the brackets my $rating_icon = $img->attr('src'); $prog->{q(rating)} = [[ $rating, '', [{'src' => $rating_icon }] ]]; } # unfortunately the star-rating (vote_box) if fetched with an AJAX call # e.g. http://www.port.sk/arrow/pls/fi/vote.print_vote_box?i_object_id=139692&i_area_id=6&i_reload_container=id%3D%22vote_box%22&i_is_separator=0 # # we could use TreeBuilder->store_cdata(true) to store the cdata under the root node, but I think it's easier to just regexp the html # /*new_from_content($ajaxdata) or die "could not fetch/parse $ajaxurl (infopage)"; worker("ajax-parsing"); # get the "star-rating" # The rating includes the number of votes. Testing for statistical significance calculates that, at # a confidence level of .95, a population of 100 will give 70% confidence in the score being accurate # (and is largely independent of population size). # Therefore only output the rating where the population is > 100. 
# if (my $anchor = $ajaxtree->look_down(_tag=>"div", class=>qr/starholder/)) { my $starsval; if (my $stars = $anchor->look_down(_tag=>"span", class=>"ctxt")) { $starsval = $stars->as_text(); $starsval =~ s/,/./; $starsval += 0; # convert to float } if (my $votes = $anchor->look_down(_tag=>"div", class=>"votenum")) { my $votesval = $votes->as_text(); if ($votesval) { (my $num_votes) = $votesval =~ /(\d*)/; # if number of votes is >100 then output the star-rating if ($num_votes ne '' && int($num_votes) > 100) { $prog->{q(star-rating)} = [[ sprintf("%.0f / 10", $starsval), "uservotes" ]]; } } } } } # restore the previous values $XMLTV::Get_nice::FailOnError = $get_fail; $XMLTV::Get_nice::Delay = $get_delay; } # collect all text lines, we # achive this to jump to the parent first, and walk all the childs until # the anchor is reached @lines = (); foreach $elem ($anchor->parent()->content_list()) { last if ((ref $elem) && ($elem == $anchor)); push @lines, get_all_text($elem); } # 0:{we are in credits secton}, 1:{duration,year section} my $section = 0; my $job = "foobar"; my $part = ""; my (%credits, $episode, $minutes, $year); my $person = ""; foreach $line (@lines) { $line =~ s/\xA0//; # remove to_text()'s results of &npsp t "processing line: '" . d $line . "'"; foreach $part (split /, */, $line) { $part =~ s/^\s+//; # remove heading blanks $part =~ s/\s+$//; # remove ending blanks $part =~ s/^,*$//; next unless length $part; t "processing part: '$part'"; # we are in credits block if a known hungarian "job:" found $section = 1 if (($section == 0) && (($_) = $part =~ m/\b(.+):/) && (defined($JOBMAP{$_}))); if ($section == 0) { # duration, year, category # possibilitys # 1: amerikai filmdráma sorozat, 90 perc, 2000, 2. rész # 12 éven aluliak számára .... # added 2004-04-07 : # (ro) Coreea de Sud, 2009, serial de aventuri, episodul 5 $_ = $part; SWITCH: { if ((m/\s*([0-9\/]+)\. $WORDS{$COUNTRY}->{episode}/) && (! 
defined $episode)) { $episode = $1; last SWITCH;} if ((m/$WORDS{$COUNTRY}->{episode} \s*([0-9\/]+)/) && (! defined $episode)) { $episode = $1; last SWITCH;} if ((m/\s*(\d+) $WORDS{$COUNTRY}->{minute}/) && (! defined $minutes)) { $minutes = $1; last SWITCH;} if ((m/\s*([12][0-9]{3})/) && (! defined $year)) { $year = $1; last SWITCH;} { ; } # default -> category, was processed over } t "found episode: '$episode'" if defined $episode; t "found minutes: '$minutes'" if defined $minutes; t "found year: '$year'" if defined $year; } # section 0 if ($section == 1) { # # is there a "hu-job:" string in the part? if yes, we should # push the last readed person, and clear the person string. # if a job is defined (hu-job) but not supported in the DTD # we will add # the person(s) as: # some_job: Foo Bar, Dummy Name, ... # note: \b(.+): do not match to " író: ", because í is not # part of \b # bug #451 [line deleted] if (($_) = $part =~ /^\s*(\S+):/) { # does the $line include a ':' # remove the "jobname:" string $part =~ s/^\s*(\S+):\s*//; t "assuming string is a jobname"; # this means, we should add our until now collected # person to the credits, and begin to collect new # actors... 
add_person($job, $person, \%credits); t "is this a known job?: '$_'"; # e.g.: hu-job if (defined $JOBMAP{lc($_)} && length($JOBMAP{lc($_)})) { # newly readed part has a en-job (this is defined in DTD, so # this will be the next used job for XML generation t "job known in DTD as: $JOBMAP{lc($_)}"; $job = lc($_); $person = $part; } #en-job else { # this job is not known in DTD, so only en-job, no hu-job; # add as descriped above, set job to foobar to add as actor $job = "foobar"; $person = "$_: $part "; } #hu-job next; } #: in the part # we are here, if: # -> $part holds ':' but it is no hu-job (no en-job) # -> it have no : if ($part =~ /^$.*$$/) { t "found () expression, addint it to person string"; # if it has the from '(...)' the found HTML was: # actor: Arnold Schweizenegger (as the Terminator) # add this to persons and do not push, it. $person .= " $part"; } else { # this is a new name, check how looks person, if it ends # with ":" do not add this to credits, only append, because in the # previuos iteration only hu-job was found. if ($person =~ /:\s*$/) { $person .= " $part"; } else { add_person($job, $person, \%credits); $person = $part; } } } #section 1 } #loop over parts } #loop over $lines # add the last processed data to credits... add_person($job, $person, \%credits) if length($person); t "CREDITS: " . 
d \%credits; $prog->{q(credits)} = \%credits; #$prog->{q(category)} = [[ $category, $COUNTRY ]] #if defined $category and length $category; $prog->{q(length)} = $minutes * 60 if defined $minutes; $prog->{q(date)} = $year if defined $year ; $prog->{q(episode-num)} = extract_episode( $episode ) if defined $episode ; $tree->delete; } #------------------------------------------------------------------------------- # get_infourl_data_json #------------------------------------------------------------------------------- # desc : merge data from linked info page into programme hash # arguments : 1- reference to the program, whom detailed descr should be grabbed # 2- url to fetch # returns : none #------------------------------------------------------------------------------- sub get_infourl_data_json( $$ ) { my $prog = shift; my $d = domain(); my $url = shift; # add port.hu/port.ro base url only if url is not contains the "://" uri separator if (! ($url =~ "://")) { $url = "https://www.$d" . $url; } # no info, so don't add it to anywhere # -> calendar.event_popup if ($url =~ "calendar\.event_popup") { t "SKIP fetching of slow url: $url"; return; } # do not grab: # -> pictures: ... pls/me/picture.popup?i_area_id # -> dvd rent links page: ... pls/w/logging.page_log?i_page_id=20... # -> sample movie ... video.link_popup?i_object_id=18822 # -> dvd sales page: www.divido.hu... # -> bet on a sport event -> sprotingbet # -> general advert links: adverticum if ($url =~ "(picture.popup|logging.page_log|video.link_popup|www\.divido\.hu|sportingbet|adverticum\.net)") { # add this url to the program push @{$prog->{q(url)}}, $url; t "SKIP fetching of slow url: $url"; return; } t "fetching slow url" . d $url; worker("slow-downloading"); t "fetching $url..."; $XMLTV::Get_nice::FailOnError = 0; my $data; if (! defined($data = get_nice($url))) { worker("slow-parsing"); warn "Could not get URL: $url, the detailed description for the program [" . $prog->{channel} . ", " . $prog->{title} . 
", " . $prog->{start} . "] will be not available. Error message: " . error_msg($url) . "." ; return; } else { if ($data =~ //) { my $orig_title = substr($data, index($data, '')+14, index($data, '', index($data, ''))-(index($data, '')+14)); $orig_title =~ s/<[^>]+>//g; $orig_title =~ s/^\s+|\s+$//g; # trim spaces $orig_title =~ s/^[^\n]+\n//g; # remove translated title $orig_title =~ s/^[^\/]+\/(.*)\/$/($1)/g; $orig_title = encode($DEFAULT_ENCODING, decode('utf-8', $orig_title)) if ($DEFAULT_ENCODING !~ /utf\-?8/i); $prog->{q(desc)} = (defined($prog->{q(desc)}) && $prog->{q(desc)} ne "") ? $prog->{q(desc)}.' '.$orig_title : $orig_title; } if ($data =~ //) { my $sum = substr($data, index($data, '')+16, index($data, '', index($data, ''))-(index($data, '')+16)); $sum =~ s/<[^>]+>//g; $sum =~ s/^\s+|\s+$//g; # trim spaces $sum =~ s/magyarul\ besz..l..\,\ //g; $sum = encode($DEFAULT_ENCODING, decode('utf-8', $sum)) if ($DEFAULT_ENCODING !~ /utf\-?8/i); $prog->{q(desc)} = (defined($prog->{q(desc)}) && $prog->{q(desc)} ne "") ? $prog->{q(desc)}.' '.$sum : $sum; } if ($data =~ //) { $data = substr($data, index($data, '

')+25, index($data, '

', index($data, '

'))-(index($data, '

')+25)); $data =~ s/<\/?article>//ig; $data =~ s// /ig; $data =~ s/[^\<]+<\/strong>//ig; # Feliratozva a teletext ... $data =~ s/.*//ig; # Forgalmazó: ... / Bemutató dátuma: ... $data =~ s/^\s+|\s+$//g; # trim spaces $data = encode($DEFAULT_ENCODING, decode('utf-8', $data)) if ($DEFAULT_ENCODING !~ /utf\-?8/i); $prog->{q(desc)} = (defined($prog->{q(desc)}) && $prog->{q(desc)} ne "") ? $prog->{q(desc)}.', '.$data : $data; } worker("slow-parsing"); } }
#-------------------------------------------------------------------------------
# extract_episode
#-------------------------------------------------------------------------------
# desc      : parse text containing the episode details and build the
#             episode-num structure for the programme hash
# arguments : 1- episode data (text such as "5", "12/3", "V./3" or "3, V")
# returns   : undef if no episode text was given; otherwise a reference to a
#             list of [ value, system ] pairs.  The "onscreen" pair (the text
#             as given) is always present; an "xmltv_ns" pair precedes it
#             only when a usable (>= 1) episode/season number was found.
#-------------------------------------------------------------------------------
sub extract_episode( $ ) {
    my $episode = shift;

    my $episode_num;
    return $episode_num unless defined $episode;

    # Captures are copied into lexicals right after each match so that
    # nothing depends on $1/$2 surviving a call into another sub.
    my $xmltv_ns;
    if ($episode =~ m#(\d+)/(\d+)#) {
        # episode-num spec with the total number specified.
        # swap numbers for port.hu, they have total/num
        my ($num, $total) = ($1, $2);
        ($num, $total) = ($total, $num) if ($num > $total);
        # however XMLTV counts from 0 on ...
        $xmltv_ns = sprintf('. %d/%d .', $num - 1, $total) if $num >= 1;
    }
    elsif ($episode =~ m#([IVX]+)\./(\d+)#) {                # patch #80
        # port.hu style episode numbering: <season>./<episode> e.g. V./3
        my ($roman, $num) = ($1, $2);
        # decode season from roman numeral; arabic() yields nothing for a
        # malformed numeral, in which case only the onscreen text is kept
        my $season = arabic($roman);
        # however XMLTV counts from 0 on ...
        $xmltv_ns = sprintf('%d . %d .', $season - 1, $num - 1)
            if defined $season && $season >= 1 && $num >= 1;
    }
    elsif ($episode =~ m#(\d+), ([IVX]+)#) {
        # "<episode>, <season as roman numeral>"
        my ($num, $roman) = ($1, $2);
        my $season = arabic($roman);
        # however XMLTV counts from 0 on ...
        $xmltv_ns = sprintf('%d . %d .', $season - 1, $num - 1)
            if defined $season && $season >= 1 && $num >= 1;
    }
    elsif ($episode =~ m#(\d+)#) {
        # episode-num spec with just the episode number
        my $num = $1;
        # however XMLTV counts from 0 on ...
        $xmltv_ns = sprintf('. %d .', $num - 1) if $num >= 1;
    }

    my @parts;
    push @parts, [ $xmltv_ns, "xmltv_ns" ] if defined $xmltv_ns;
    push @parts, [ $episode, "onscreen" ];
    $episode_num = \@parts;

    return $episode_num;
}

#------------------------------------------------------------------------------- # grab_icon #------------------------------------------------------------------------------- # desc : fetch (if needed and specified) channel icons, returns pointing URL # arguments : 1- channel id (eg 003) # returns : url pointing to tha program's logo (icon) http:|file:... #------------------------------------------------------------------------------- sub grab_icon( $ ) { # if icon not requested return unless ($opt_icons || $opt_local_icons); my $channelid = shift; my $fetchurl = "https://www." . domain() . "/tv/kep_ado/al_".(int(${channelid}) % 10000).".gif"; my ($file, $iconurl); # that $fetchurl no longer works for RO, so... #test if url is valid $XMLTV::Get_nice::FailOnError = 0; my $image = get_nice($fetchurl); if (!defined $image) { # image url not valid, so we must get it from the programmes page. Ideally we would do that during the main grab but this is a Q&D fix # and I don't want to change too much of this code my $url = "https://www." . domain() . "/pls/w/".($COUNTRY eq 'hu' || $COUNTRY eq 'ro' ? 
'old' : '')."tv.channel?i_ch=".$channelid."&i_date=".UnixDate('today','%Y-%m-%d')."&i_where=1"; # bug #501 my $data=get_nice($url); my $tree = HTML::TreeBuilder->new_from_content($data) or die "could not fetch/parse $url (grab_icon)\n"; worker("base-parsing"); my $body = $tree->look_down("_tag"=>"body"); my $container = $body->look_down("_tag" => "div", "class" => qr/main-container-100/); if ($container) { if (my $imgdiv = $container->look_down("_tag" => "div", "style" => qr/float\s*:\s*left/, sub { my $imgtag = $_[0]->look_down('_tag' => 'img'); return 0 unless $imgtag; return $imgtag->attr('src') =~ m/https:\/\/media/; } )) { $fetchurl = $imgdiv->look_down('_tag' => 'img')->attr('src'); } } } $XMLTV::Get_nice::FailOnError = 1; return $fetchurl if ($opt_icons && ! $opt_local_icons); # create directory mkdir $opt_local_icons unless (-d $opt_local_icons); # remove multiple /; make absoluth path $_ = "${opt_local_icons}/${channelid}.gif"; s!//!/!g; $file = Cwd::abs_path( $_ ); $iconurl = "file://${file}"; return $iconurl if ($opt_local_icons && $opt_no_fetch_icons); if (! -d $opt_local_icons) { warn "directory not exists, and cannot create: $opt_local_icons; " . "icon will be not grabbed"; return $fetchurl; } if (open(FILE,">$file")) { t "fetching $fetchurl..."; $XMLTV::Get_nice::FailOnError = 0; #if (my $image = get_nice($fetchurl)) { # now grabbed above if (!$image) { $image = get_nice($fetchurl); } if ($image) { t "icon for $channelid grabbed successfully"; print FILE $image; close FILE; # success return $iconurl; } else { warn "Could not download channel-logo for channel $channelid, using remote URL instead. " . "Error message: " . error_msg($fetchurl) . 
"."; close FILE; unlink $file; return $fetchurl; } } else { warn "cannot create icon file ($file) for channel $channelid, using remote URL instead"; close FILE; unlink $file; return $fetchurl; } return; }

#-------------------------------------------------------------------------------
# get_channel_urls
#-------------------------------------------------------------------------------
# desc      : fetch the channel's listing page and collect the URLs that
#             describe the channel: the link found next to the "web:" label
#             (if any) followed by the listing page URL itself
# arguments : 1- channel id (eg 003)
# returns   : list of urls; the listing page URL is always the last element,
#             also when the page could not be downloaded
# note      : sets $XMLTV::Get_nice::FailOnError to 0 and leaves it that way
#-------------------------------------------------------------------------------
sub get_channel_urls( $ ) {
    my $ch_did = shift;
    my @result = ();

    # two sprintf parameters: first: channel_id, second: how many days grabbed
    my $churlfmt = "https://www." . domain() . "/pls/tv/"
        . ($COUNTRY eq 'hu' || $COUNTRY eq 'ro' ? 'old' : '')
        . "tv.channel?i_ch=%d&" . "i_days=1&i_xday=%d&i_where=1";   # bug #501

    # url to grab now (4 days - this is the minimum)
    my $churl = sprintf($churlfmt, $ch_did, 4);
    # url to add as the information source (4 days - this is the minimum)
    my $portchurl = sprintf($churlfmt, $ch_did, 4);

    t "fetching page for channel urls: $churl\n";
    worker("base-downloading");
    t "fetching $churl...";
    $XMLTV::Get_nice::FailOnError = 0;
    my $chdata = get_nice($churl);
    if (! defined $chdata) {
        worker("base-parsing");
        warn "Could not get URL: $churl, the information urls for the channel $ch_did will be not available. " .
            "Error message: " . error_msg($churl) . ".";
        push @result, $portchurl;
        return @result;
    }

    my $tree = HTML::TreeBuilder->new_from_content($chdata)
        or die "could not fetch/parse $churl (channel infopage)";
    worker("base-parsing");

    # anchor point: a B element whose text contains "web:"; the channel's
    # own URL is the first A element inside the P element enclosing it.
    # Every step may come back empty, so each one is checked before use.
    my $href;
    my ($label) = $tree->look_down(
        _tag => "b",
        sub { lc($_[0]->as_text()) =~ /web:/ }
    );
    if ($label) {
        my $para = $label->look_up("_tag" => "p");
        my $link = $para ? $para->look_down("_tag" => "a") : undef;
        $href = $link->attr(q(href)) if $link;
    }
    if (defined $href) {
        push @result, $href;
    }
    else {
        t "channel url not found";
    }

    # the parsed tree is not needed any more; release it explicitly
    $tree->delete;

    # add PORT url, too, this should be the last (and open 3 days if clicked)
    push @result, $portchurl if defined $portchurl;
    return @result;
}

#------------------------------------------------------------------------------- # load_configs #------------------------------------------------------------------------------- # desc : load the tv_grab_huro.conf, jobmap, catmap.$COUNTRY files, and # sets the globals: %CATMAP, %JOBMAP # arguments : none # returns : array of port channel ids: ( 001, 005 ) #------------------------------------------------------------------------------- sub load_configs() { my @config_lines = XMLTV::Config_file::read_lines($CONFIG_FILE); my $line_num = 0; my (@portids, $where, @fields); foreach (@config_lines) { ++ $line_num; next if not defined; $where = "$CONFIG_FILE:$line_num"; if (/^country:?\s+(\w\w)$/) { if ($1 ne 'hu') { die "$where: Country '$1' no longer supported in grabber!\n"; } warn "$where: already seen country\n" if defined $COUNTRY; $COUNTRY = $1; } elsif (/^channel:?\s+(\S+)\s+([^\#]+)/) { my $ch_did = $1; my $ch_name = $2; $ch_name =~ s/\s*$//; push @portids, $ch_did; # FIXME do not store display-name in the config file - it is # ignored here. 
} else { warn "$CONFIG_FILE:$.: bad line\n"; } } for ($COUNTRY) { if (not defined) { $_ = 'hu'; warn "country not seen in $CONFIG_FILE, assuming '$_'\n"; } } # Lame reverse lookup on %COUNTRIES. foreach (values %COUNTRIES) { if ($_->[0] eq $COUNTRY) { $TZ = $_->[1]; last; } } die "$where: unknown country $COUNTRY\n" if not defined $TZ; # jobmap file # (this is a file, where we store translations of job names from # Hungarian or Romanian language to English. However we leave some # translations blank, namely these that have no field in the credits # structure) # # Read the file with channel mappings. my $jobmap_file = "jobmap"; my $jobmap_str = GetSupplement( 'tv_grab_huro', $jobmap_file ); $line_num = 0; foreach (split( /\n/, $jobmap_str )) { ++ $line_num; tr/\r//d; s/#.*//; next if m/^\s*$/; s/^\s+|\s+$//g; # trim spaces $where = "$jobmap_file:$line_num"; @fields = split m/:/; die "$where: wrong number of fields" if @fields > 2; my ($huro_job, $credits_id) = @fields; $JOBMAP{$huro_job} = defined($credits_id) ? $credits_id : ""; } # read the file with category mappings. # cat_en:cat_hu:regexp my $catmap_file = "catmap.$COUNTRY"; my $catmap_str=""; $catmap_str = GetSupplement( 'tv_grab_huro', $catmap_file ); $line_num = 0; foreach (split( /\n/, $catmap_str )) { ++ $line_num; tr/\r//d; s/#.*//; next if m/^\s*$/; s/^\s+|\s+$//g; # trim spaces $where = "$catmap_file:$line_num"; @fields = split m/:/; die "$where: wrong number of fields" if @fields > 3; my ($cat_en, $cat_hu, $cat_reg) = @fields; $CATMAP{$cat_en} = defined($cat_reg) ? 
[$cat_reg, $cat_hu] : [$cat_hu, $cat_hu]; } return @portids; }

#-------------------------------------------------------------------------------
# worker
#-------------------------------------------------------------------------------
# desc      : account wall-clock seconds to the named parts of this program.
#             Each call charges the time elapsed since the previous call to
#             the previously active part, then makes the given part active.
# arguments : 1- name of the worker part of this program, currently:
#                xml-writing, base-downloading, slow-downloading
#                base-parsing, slow-parsing
# returns   : none
# globals   : $WNAME (active part), $WSTIME (when it became active),
#             %WTIMES (seconds accumulated per part)
#-------------------------------------------------------------------------------
sub worker( $ ) {
    my $now       = time();
    my $newworker = shift;

    # very first call: nothing to charge yet, just start the clock
    if (! defined $WNAME) {
        $WNAME          = $newworker;
        $WTIMES{$WNAME} = 0;
        $WSTIME         = $now;
        return;
    }

    $WTIMES{$WNAME} += $now - $WSTIME;
    $WSTIME = $now;
    $WNAME  = $newworker;
}

#-------------------------------------------------------------------------------
# showworkers
#-------------------------------------------------------------------------------
# desc      : prints the per-part times collected in %WTIMES to STDERR, one
#             line per part (sorted by name) plus a total line; prints
#             nothing when $opt_quiet is set or $opt_worker_times is not set
# arguments : none
# returns   : none
#-------------------------------------------------------------------------------
sub showworkers() {
    return if $opt_quiet;
    return if not $opt_worker_times;

    my $total = 0;
    $total += $_ foreach values %WTIMES;

    # the percentage needs a non-zero divisor; the printed total stays exact
    my $divisor = $total ? $total : 1;

    foreach my $name (sort keys %WTIMES) {
        my $secs = $WTIMES{$name};
        printf STDERR ("%-20s: %3d:%02dm %3d%%\n",
                       $name, $secs / 60, $secs % 60, 100 * $secs / $divisor);
    }
    printf STDERR ("%-20s: %3d:%02dm\n", "total", $total / 60, $total % 60);
}

#-------------------------------------------------------------------------------
# arabic
#-------------------------------------------------------------------------------
# desc      : This example uses Robin Houston's entry in the Perl
#           : Institute's Roman Numeral Challenge to convert Roman numerals
#           : to Arabic.
#           : http://www.perl.org/wits.html
#           : (we could just use Roman, but don't want to add the dependency)
# arguments : roman number (all-lowercase input is upper-cased first)
# returns   : integer; nothing (empty list / undef) when the numeral is
#             malformed
# how       : the numerals are visited from the smallest (I) to the largest
#             (M).  An occurrence directly followed by a not yet converted,
#             i.e. larger, numeral becomes "-value", every other occurrence
#             becomes "+value"; the resulting sum is then evaluated.  A
#             numeral found after an already added smaller one (e.g. "IIV")
#             poisons the string with ")" so that the final check rejects it.
#-------------------------------------------------------------------------------
sub arabic( $ ) {
    # work on a private copy: the caller's $_ must not be modified
    my ($expr) = @_;
    $expr = uc $expr if $expr !~ /[^a-z]/;

    # $n is the value of the current numeral; $d flips between 5 and 2 so
    # that $n runs through 1, 5, 10, 50, 100, 500, 1000
    my ($n, $d) = (1, 2);
    for my $v (split //, 'IVXLCDM') {
        $expr =~ s/\+.*$v/)/;
        $expr =~ s/$v([^$v+-])/-$n$1/g;
        $expr =~ s/$v/+$n/g;
        $n *= $d ^= 7;
    }

    # only signs and digits can be left at this point, so the string eval
    # below is limited to evaluating a plain sum
    return $expr =~ /[^-+\d]/ ? () : eval $expr;
}

#-------------------------------------------------------------------------------
#				M A I N
#-------------------------------------------------------------------------------

# Whether zero-length programmes should be included in the output.
my $WRITE_ZERO_LENGTH = 0;

# Get options, including undocumented --cache option.
XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
$opt_slow      = 0;
$opt_full_desc = 0;
$opt_days      = 8;    # default
$opt_offset    = 0;    # default
$opt_quiet     = 0;    # default
GetOptions(
    'days=i'               => \$opt_days,
    'offset=i'             => \$opt_offset,
    'help'                 => \$opt_help,
    'configure'            => \$opt_configure,
    'gui:s'                => \$opt_gui,
    'config-file=s'        => \$opt_config_file,
    'output=s'             => \$opt_output,
    'quiet'                => \$opt_quiet,
    'slow'                 => \$opt_slow,
    'list-channels'        => \$opt_list_channels,
    'icons'                => \$opt_icons,
    'local-icons=s'        => \$opt_local_icons,
    'no-fetch-icons'       => \$opt_no_fetch_icons,
    'now=s'                => \$opt_now,
    'worker-times'         => \$opt_worker_times,
    'get-full-description' => \$opt_full_desc,
    'max-desc-length=i'    => \$opt_max_desc_length
) or usage(0);
die 'number of days must not be negative'
    if (defined $opt_days && $opt_days < 0);
usage(1) if $opt_help;

my $mode = XMLTV::Mode::mode('grab', # default
                             $opt_configure     => 'configure',
                             $opt_list_channels => 'list-channels');

XMLTV::Ask::init($opt_gui);

# File that stores which channels to download.
$CONFIG_FILE = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_huro', $opt_quiet, 'tv_grab_hu');

#-------------------------------------------------------------------------------
# only configuration
#-------------------------------------------------------------------------------
# Asks for the country and for every channel of that country, and writes the
# answers to $CONFIG_FILE (unwanted channels are written commented out).
if ($mode eq 'configure') {
    worker("base-parsing");
    XMLTV::Config_file::check_no_overwrite($CONFIG_FILE);

    open(my $conf_fh, '>', $CONFIG_FILE)
        or die "cannot write to $CONFIG_FILE: $!";

    my $default_cn = 'Hungary';
    my $cn = ask_choice('Grab listings for which country?', $default_cn,
                        sort keys %COUNTRIES);
    $COUNTRY = $COUNTRIES{$cn}[0];
    print {$conf_fh} "country $COUNTRY\t# $cn\n";

    # Ask about each channel.
    # sets %CHANNELS
    if (($COUNTRY) && (($COUNTRY eq 'hu') || ($COUNTRY eq 'ro'))) {
        get_channels_json();
    }
    else {
        get_channels();
    }

    my @portids = sort keys %CHANNELS;
    my @names   = map { $CHANNELS{$_}->{'display-name'}->[0][0] } @portids;
    my @qs      = map { "add channel $_?" } @names;
    my @want    = ask_many_boolean(1, @qs);
    foreach (@portids) {
        my $w = shift @want;
        warn("cannot read input, stopping channel questions"), last
            if not defined $w;
        # No need to print to user - XMLTV::Ask is verbose enough.

        # Print a config line, but comment it out if channel not wanted.
        print {$conf_fh} '#' if not $w;
        my $name = shift @names;
        print {$conf_fh} "channel $_ $name\n";
        # TODO don't store display-name in config file.
    }

    close $conf_fh or warn "cannot close $CONFIG_FILE: $!";
    say("Finished configuration.");

    worker("base-parsing");
    showworkers();
    exit();
}

# Options to be used for XMLTV::Writer.
my %w_args;
$w_args{encoding} = $DEFAULT_ENCODING;
if (defined $opt_output) {
    # explicit write mode: the file name is never interpreted as a mode
    my $fh = IO::File->new($opt_output, 'w');
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}

#-------------------------------------------------------------------------------
# only channel listing
#-------------------------------------------------------------------------------
if ($mode eq 'list-channels') {
    # Write channels mode.
    worker("base-parsing");
    $COUNTRY = 'hu';

    worker("xml-writing");
    my $writer = XMLTV::Writer->new(%w_args);
    $writer->start(xhead());

    worker("base-parsing");
    # sets %CHANNELS
    if (($COUNTRY) && (($COUNTRY eq 'hu') || ($COUNTRY eq 'ro'))) {
        get_channels_json();
    }
    else {
        get_channels();
    }

    # sort channels based on their portid
    my @portids = sort keys %CHANNELS;

    worker("xml-writing");
    $writer->write_channel($CHANNELS{$_}) foreach @portids;
    $writer->end();

    worker("base-parsing");
    showworkers();
    exit();
}

#-------------------------------------------------------------------------------
# only grabbing
#-------------------------------------------------------------------------------
if ($mode eq 'grab') {
    worker("base-parsing");
    my $ch_did;
    my $bar;
    my @portids = load_configs();

    # sets %CHANNELS
    if (($COUNTRY) && (($COUNTRY eq 'hu') || ($COUNTRY eq 'ro'))) {
        get_channels_json($mode);
    }
    else {
        get_channels($mode);
    }

    worker("xml-writing");
    my $writer = XMLTV::Writer->new(%w_args);

    worker("base-parsing");
    # we have to fetch @portids icons, and @portids pages for channel URL
    # (e.g.: www.hbo.hu)
    $bar = XMLTV::ProgressBar->new('getting channel details ', 2 * @portids)
        if not $opt_quiet;

    worker("xml-writing");
    $writer->start(xhead());

    worker("base-parsing");
    # Write channel elements
    foreach $ch_did (@portids) {
        if (! $CHANNELS{$ch_did}) {
            warn "\nWARNING: Channel with port-id $ch_did no more exists on the site, skipping it's channel description grabbing!";
            next;
        }
        my %channel = %{$CHANNELS{$ch_did}};

        worker("base-downloading");
        # fetch and get icon url
        if (my $iconurl = grab_icon( $ch_did )) {
            $channel{'icon'} = [ { src => $iconurl } ];
        }
        $bar->update() if not $opt_quiet;

        worker("base-parsing");
        # channel URLs are only looked up for countries other than hu/ro
        if (($COUNTRY) && ($COUNTRY ne 'hu') && ($COUNTRY ne 'ro')
            && (my @churls = get_channel_urls( $ch_did ))) {
            $channel{'url'} = \@churls;
        }
        $bar->update() if not $opt_quiet;

        worker("xml-writing");
        $writer->write_channel(\%channel);
        worker("base-parsing");
    }
    $bar->finish() if not $opt_quiet;

    if (!defined($COUNTRY) || (($COUNTRY ne 'hu') && ($COUNTRY ne 'ro'))) {
        # old, HTML based pages
        # The grabber's source allows requests of more than one day per page.
        # This can be done by specifying the i_xday argument with the GET
        # request.
        #
        # To not load their server too much (requesting e.g. 14 channels in
        # one shot should 'cause quite some traffic to the SQL server) I think
        # we shouldn't query for more then 5 channels per page. With the
        # default of requesting data for 8 days this leads to 2 requests per
        # channel and grab ...
        my $pages = int($opt_days / 5) + (($opt_days % 5) ? 1 : 0);
        # $pages is 0 for --days 0; do not divide by it in that case
        $DAYSPERPAGE = $pages ? int($opt_days / $pages) : 0;
        # We have to request at minimum of four days
        $DAYSPERPAGE = 4 if ($DAYSPERPAGE < 4);
    }
    else {
        # JSON
        $DAYSPERPAGE = 1;
    }
    t "requesting $DAYSPERPAGE days per scraped webpage ...";

    # port.hu|ro provide the today's program based on the localtime on
    # Hungary. So in other lands e.g. Australia (thx Zsolt Bayer) (TZ: EST/AEST)
    # if there is f.e. friday 22:38 here in Hungary it is saturday 04:38
    # so Zsolt will get the programs not for the requested day (the XML will be
    # correct, just the wrong day is in)
    #
    # we cannot use Date::Manip's Date_ConvTZ, because it does not detects
    # correctly f.e. the Australia/Melbourne zone. (because it uses `date +%Z`
    # to get the zone, and date will output EST and not AEST :-().
    # [we could not use f.e. `date +%z`, becuase what happen on windows?]
    #
    # that means: we will here not set the global FETCHOFFSET to fetch
    # the "today's" program from everywhere on the world, but we will grab
    # at first 3 pages (0, -1, +1) to find the correct offset.
    my $now = parse_date("now");
    # developer's options --now: what time is it? (measured in local time)
    $now = parse_date( $opt_now ) if ($opt_now);
    t "now=$now";

    my $startat     = DateCalc($now, "$opt_offset days");
    my $startatdate = UnixDate($startat, '%Q');
    t "start grabbing from (offset added, localtime): $startatdate";

    # make list: which date is which day on the website, we will make grabbing
    # requests based on the @days array
    my @days;
    for (my $i = 1 + $opt_offset;
         $i < 1 + $opt_offset + $opt_days;
         $i += $DAYSPERPAGE) {
        push @days, [ $startatdate, $i ];
        # calculate the next date: bump a YYYYMMDD date by $DAYSPERPAGE day
        $startatdate = UnixDate(DateCalc(parse_date($startatdate), "+ $DAYSPERPAGE days"), '%Q');
        die "Could not calculate next grabbing date $days[$#days][0] (+$DAYSPERPAGE days)"
            if not defined $startatdate;
    }

    # This progress bar is for both downloading and parsing.  Maybe
    # they could be separate stages.
    $bar = XMLTV::ProgressBar->new('getting program listings', @days * @portids)
        if not $opt_quiet;

    foreach my $date_n_day (@days) {
        my ($idate, $iday) = @$date_n_day;
        my $some_success = 0;
        foreach $ch_did (@portids) {
            if (! $CHANNELS{$ch_did}) {
                warn "\nWARNING: Channel with port-id $ch_did no more exists on the site, skipping it's program grabbing!";
                next;
            }
            my @ps = (($COUNTRY) && (($COUNTRY eq 'hu') || ($COUNTRY eq 'ro')))
                ? process_json($idate, xid($ch_did), $ch_did, $iday)
                : process_table($idate, xid($ch_did), $ch_did, $iday);
            $some_success = 1 if @ps;

            worker("xml-writing");
            $writer->write_programme($_) foreach @ps;
            worker("base-parsing");
            $bar->update() if not $opt_quiet;
        }
        # a whole day without a single programme: give up on later days too
        if (@portids and not $some_success) {
            warn "failed to get any listings for day $iday, stopping\n";
            last;
        }
    }
    $bar->finish() if not $opt_quiet;

    worker("xml-writing");
    $writer->end();

    worker("base-parsing");
    showworkers();
    exit(0);
}

# every known mode exits above; getting here means the mode was not handled
die "unexpected mode '$mode'\n";