#!/usr/bin/perl use strict; use warnings; # Copyright (C) 2009 Glen Pitt-Pladdy # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # # # See: http://www.pitt-pladdy.com/blog/_20091122-152049_0000_IMDB_ratings_for_MythTV/mythtv_listings_imdbmovieratings/ # version: 20110429 # stuff to ignore - some channels don't seem to know what a "Movie" is my %IGNORE; $IGNORE{standardise('Floyd Around The Med')} = 1; $IGNORE{standardise('Floyd Uncorked')} = 1; $IGNORE{standardise('Cooking The Books')} = 1; $IGNORE{standardise('Property List: Top Spots')} = 1; $IGNORE{standardise('How Not To Decorate')} = 1; $IGNORE{standardise('Getting On The Property Ladder')} = 1; $IGNORE{standardise('Matthew Vaughn on Kick Ass')} = 1; $IGNORE{standardise('Bring It On: In It to Win It')} = 1; $IGNORE{standardise('Everyday Adventure')} = 1; $IGNORE{standardise('TView Movie Preview')} = 1; #$IGNORE{standardise('')} = 1; # path to mysql credentials my $MYSQL_CREDENTIALS = "/etc/mythtv/mysql.txt"; # paths to IMDB lists my $IMDB = "$ENV{HOME}/imdb"; my $IMDB_AKA = "$IMDB/aka-titles.list"; my $IMDB_ISOAKA = "$IMDB/iso-aka-titles.list"; my $IMDB_RATINGS = "$IMDB/ratings.list"; # recent no-matches cache my $NOMATCHES = "$IMDB/nomatches"; my $NOMATCHESMAXAGE = $^T - 604800; # 1 week old # Levenshtein (edit) distances for fuzzy matching - rather inefficient! use Text::Levenshtein qw(distance); my $MAXDISTANCE = 3; # be real nice setpriority 0,0,20; # check age of databases if ( -M $IMDB_AKA > 183 or -M $IMDB_ISOAKA > 183 or -M $IMDB_RATINGS > 183 ) { warn "WARNING - some of your IMDB data failes are older than 6 months!\nPlease update them!\n\n"; } # read in db credentials open CR, $MYSQL_CREDENTIALS or die "FATAL - can't read \"$MYSQL_CREDENTIALS\": $!\n"; my $linenumber = 0; my %credentials; while ( defined ( my $line = ) ) { chomp $line; ++$linenumber; if ( $line !~ /^(\w+)=(.+)$/ ) { die "FATAL - can't parse line $linenumber in \"$MYSQL_CREDENTIALS\"\n"; } $credentials{$1} = $2; } close CR; # read in no-matches so we don't re-try matching stuff we can't match my %nomatches; if ( -f $NOMATCHES and -M $NOMATCHES < -M $IMDB_RATINGS ) { # exists and no newer than ratings open NM, $NOMATCHES or die "$0:".__LINE__." FATAL - can't read \"$NOMATCHES\": $!\n"; while ( defined ( my $line = ) ) { chomp $line; my ( $timestamp, $title ) = split ',', $line; if ( $timestamp > $NOMATCHESMAXAGE ) { # recent enough $nomatches{$title} = $timestamp; } } close NM; } # read imdb aka list open AKA, $IMDB_AKA or die "$0:".__LINE__." FATAL - can't read \"$IMDB_AKA\": $!\n"; $linenumber = 0; while ( defined ( my $line = ) ) { ++$linenumber; chomp $line; if ( $line =~ /^AKA TITLES LIST$/ ) { last; } } ; ++$linenumber; my $title; my $year; my %akadatabase; while ( defined ( my $line = ) ) { ++$linenumber; chomp $line; # skip video games if ( $line =~ /\(VG\)/ ) { next; } # skip blanks if ( $line =~ /^$/ ) { next; } # end if we hit the last line if ( $line =~ /^-----------------------------------------------/ ) { last; } # characters at beginning then new title if ( $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)$/ or $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}$/ ) { $title = $1; $year = $2; next; } # if we match a bracketed line we have the aka title if ( $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\)/ or $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}\)/ ) { my $aka = $1; my $akayear = $2; push @{$akadatabase{$year}->{$title}->{$akayear}}, $aka; next; } # error! die "$0:".__LINE__." ERROR: can't understand line $linenumber:\n$line\n"; } close AKA; # read imdb iso aka list open AKA, $IMDB_ISOAKA or die "$0:".__LINE__." FATAL - can't read \"$IMDB_ISOAKA\": $!\n"; $linenumber = 0; while ( defined ( my $line = ) ) { ++$linenumber; chomp $line; if ( $line =~ /^AKA TITLES LIST ISO$/ ) { last; } } ; ++$linenumber; while ( defined ( my $line = ) ) { ++$linenumber; chomp $line; # skip video games if ( $line =~ /\(VG\)/ ) { next; } # skip blanks if ( $line =~ /^$/ ) { next; } # end if we hit the last line if ( $line =~ /^-----------------------------------------------/ ) { last; } # characters at beginning then new title if ( $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)$/ or $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}$/ ) { $title = $1; $year = $2; next; } # if we match a bracketed line we have the aka title if ( $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\)/ or $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}\)/ ) { my $aka = $1; my $akayear = $2; push @{$akadatabase{$year}->{$title}->{$akayear}}, $aka; next; } # error! die "$0:".__LINE__." ERROR: can't understand line $linenumber:\n$line\n"; } close AKA; # read imdb ratings open RAT, $IMDB_RATINGS or die "$0:".__LINE__." FATAL - can't read \"$IMDB_RATINGS\": $!\n"; $linenumber = 0; while ( defined ( my $line = ) ) { ++$linenumber; chomp $line; if ( $line =~ /^MOVIE RATINGS REPORT$/ ) { last; } } ;; ++$linenumber;++$linenumber; my %database; while ( defined ( my $line = ) ) { ++$linenumber; chomp $line; # skip video games if ( $line =~ /\(VG\)/ ) { next; } # end on blank line if ( $line =~ /^$/ ) { last; } if ( $line !~ /^\s+[\.\d\*]+\s+\d+\s+([\d\.]+)\s+([^\s].*[^\s])\s+\(([\d?]{4}).*\)/ and $line !~ /^\s+[\.\d\*]+\s+\d+\s+([\d\.]+)\s+([^\s])\s+\(([\d?]{4}).*\)/ ) { warn "$0:".__LINE__." WARNING: can't understand line $linenumber:\n$line\n"; next; } # we got the info my $rating = $1; my $title = $2; my $year = $3; $title =~ s/^"([^"]+)"$/$1/; # fill our db $database{$year}->{standardise($title)} = $rating; $database{titleonly}->{standardise($title)} = $rating; # fill db with aliases if ( defined ( $akadatabase{$year}->{$title} ) ) { foreach my $akayear (keys %{$akadatabase{$year}->{$title}}) { foreach my $aka (@{$akadatabase{$year}->{$title}->{$akayear}}) { # fill our $database{$year}->{standardise($aka)} = $rating; $database{$akayear}->{standardise($aka)} = $rating; $database{titleonly}->{standardise($aka)} = $rating; } } } } close RAT; # generate variations my %databasevariant; foreach my $year (keys %database) { foreach my $name (keys %{$database{$year}}) { my $variant = variation ( $name ); if ( $variant ne $name ) { $databasevariant{$year}->{$name} = $database{$year}{$name}; } } } use DBI; my $dbh = DBI->connect ( 'DBI:mysql:'.$credentials{DBName}.':'.$credentials{DBHostName}, $credentials{DBUserName}, $credentials{DBPassword} ); # get entries with zero (no) rating my $program_data = $dbh->prepare("SELECT DISTINCT title,subtitle,stars,originalairdate FROM program WHERE category_type = 'movie' AND stars = 0"); $program_data->execute(); # try and match them to something in the database and add the rating while ( my @fields = $program_data->fetchrow_array() ) { my $title = standardise ( $fields[0] ); # title my $subtitle = standardise_sub ( $fields[1] ); # subtitle my $year = $fields[3]; # originalairdate # exclude nomatches - don't waste our time on stuff we can't match if ( $nomatches{$title} ) { next; } print "-------\n"; if ( $year ) { $year =~ s/-\d\d-\d\d$//; # get year print "TryingMatch: $title ($year)\n"; } else { print "TryingMatch: $title (no year)\n"; } # check if we ignore it if ( $IGNORE{$title} ) { # ignored - give it a minimal score to stop it coming back my $sql = 'UPDATE program SET stars = '.0.01; $sql .= ' WHERE title = '.$dbh->quote($fields[0]); $sql .= ' AND subtitle = '.$dbh->quote($fields[1]); if ( $year ) { $sql .= ' AND originalairdate = '.$dbh->quote($fields[3]); } # run the update against the database my $program_update = $dbh->prepare ( $sql ); $program_update->execute(); next; } # get the match my $rating = bestmatchyear ( $year, $title ); if ( $rating ) { print "got rating: $rating\n"; # set the entry in the db $rating /= 10; my $sql = 'UPDATE program SET stars = '.$rating; $sql .= ' WHERE title = '.$dbh->quote($fields[0]); $sql .= ' AND subtitle = '.$dbh->quote($fields[1]); if ( $year ) { $sql .= ' AND originalairdate = '.$dbh->quote($fields[3]); } # print the line if ( ! defined ( $fields[3] ) ) { $fields[3] = "-"; } print "* $fields[0] : $fields[3] : $fields[2]\n"; # run the update against the database my $program_update = $dbh->prepare ( $sql ); $program_update->execute(); next; } # try mythtv with subtitle if ( $subtitle ne '' ) { print "trying with subtitle:\n\t>$title<\n\t>$subtitle<\n"; $rating = bestmatchyear ( $year, "$title $subtitle" ); if ( $rating ) { print "got rating (with subtitle): $rating\n"; # set the entry in the db $rating /= 10; my $sql = 'UPDATE program SET stars = '.$rating; $sql .= ' WHERE title = '.$dbh->quote($fields[0]); $sql .= ' AND subtitle = '.$dbh->quote($fields[1]); # prep the sql if ( $year ) { $sql .= ' AND originalairdate = '.$dbh->quote($fields[3]); } # print the line if ( ! defined ( $fields[3] ) ) { $fields[3] = "-"; } print "* $fields[0] : $fields[3] : $fields[2]\n"; # run the update against the database my $program_update = $dbh->prepare ( $sql ); $program_update->execute(); next; } } # failed to match $nomatches{$title} = $^T; # store the no-match if ( ! defined ( $fields[3] ) ) { $fields[3] = "NoYear"; } print "NoMatch: $fields[0] : $fields[1] : $fields[3] : $fields[2]\n"; } # write no-matches cache for next time open NM, ">$NOMATCHES" or die "$0:".__LINE__." FATAL - can't write \"$NOMATCHES\": $!\n"; foreach my $title (keys %nomatches) { print NM "$nomatches{$title},$title\n"; } close NM; sub bestmatchyear { my ( $year, $title ) = @_; # try match in same year my $result = match ( $year, $title ); if ( $result ) { return $result; } # failing that, if year available then try shifting years by one if ( $year ) { $result = match ( $year-1, $title ); if ( $result ) { return $result; } $result = match ( $year+1, $title ); if ( $result ) { return $result; } } } sub match { my ( $year, $title ) = @_; # check databse for exact match my $ratings; my $ratingsvariant; my @names; # check what stuff we have to go on if ( $year ) { $ratings = $database{$year}; $ratingsvariant = $databasevariant{$year}; } else { $ratings = $database{titleonly}; $ratingsvariant = $databasevariant{titleonly}; } # try for an exact match if ( $ratings->{$title} ) { return $ratings->{$title}; } # try variants match if ( $ratingsvariant->{$title} ) { return $ratingsvariant->{$title}; } # check Levenshtein to take care of typos etc. # this is VERY slow so only do this if all else fails # as an optimisation, only check titles of length +-$MAXDISTANCE my @titles; my $maxlength = length ( $title ) + $MAXDISTANCE; my $minlength = length ( $title ) - $MAXDISTANCE; foreach my $title (keys %{$ratings}) { my $titlelength = length ( $title ); if ( $titlelength <= $maxlength and $titlelength >= $minlength ) { push @titles, $title; } } # now check the Levenshtein distance agains the similar length titles my @distances = distance ( $title, @titles ); my $distance = 1e6; my $counter = 0; my $besttitle; foreach my $dbtitle (@titles) { if ( $distances[$counter] < $distance ) { #print "$title :: $dbtitle ($distances[$counter])\n"; $besttitle = $dbtitle; $distance = $distances[$counter]; # if ( $distance == 0 ) { last; } # perfect match } ++$counter; } if ( $besttitle and $distance <= $MAXDISTANCE ) { return $ratings->{$besttitle}; } } # strip puncuation and use standard representations sub standardise { my $title = shift; if ( ! $title ) { return ""; } $title = standardise_sub ( $title ); # extra stuff for titles $title =~ s/^a\s(.+)$/$1 a/i; $title =~ s/^le\s(.+)$/$1 le/i; return $title; } sub standardise_sub { my $title = shift; if ( ! $title ) { return ""; } $title = lc ( $title ); $title =~ s/\xb2/2/; $title =~ s/&/ and /g; $title =~ s/'//g; # remove apostrophes/quotes $title =~ s/[^\w\s]/ /g; # non-text characters to spaces $title =~ s/\s000\s000\s/ milion /; $title =~ s/\s000\s/ thousand /; $title =~ s/\s1\s/ one /; $title =~ s/^1\s/one /; $title =~ s/\s2\s/ two /; $title =~ s/^2\s/two /; $title =~ s/\s10\s/ ten /; $title =~ s/^10\s/ten /; $title =~ s/(\s)\s+/$1/g; # remove multiple spaces $title =~ s/^\s+//; # strip spaces front/end $title =~ s/\s+$//; $title =~ s/^the\s(.+)$/$1 the/i; # standardise the use of "the" return $title; } # this would be in case they left off bits sub variation { my $title = shift; $title =~ s/\sa$//; $title =~ s/\sthe$//; $title =~ s/\sle$//; return $title; }