#!/usr/bin/perl
use strict;
use warnings;
# Copyright (C) 2009  Glen Pitt-Pladdy
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
#
# See: http://www.pitt-pladdy.com/blog/_20091122-152049_0000_IMDB_ratings_for_MythTV/mythtv_listings_imdbmovieratings/
# version: 20110429


# stuff to ignore - some channels don't seem to know what a "Movie" is
my %IGNORE;
$IGNORE{standardise('Floyd Around The Med')} = 1;
$IGNORE{standardise('Floyd Uncorked')} = 1;
$IGNORE{standardise('Cooking The Books')} = 1;
$IGNORE{standardise('Property List: Top Spots')} = 1;
$IGNORE{standardise('How Not To Decorate')} = 1;
$IGNORE{standardise('Getting On The Property Ladder')} = 1;
$IGNORE{standardise('Matthew Vaughn on Kick Ass')} = 1;
$IGNORE{standardise('Bring It On: In It to Win It')} = 1;
$IGNORE{standardise('Everyday Adventure')} = 1;
$IGNORE{standardise('TView Movie Preview')} = 1;
#$IGNORE{standardise('')} = 1;

# path to mysql credentials
my $MYSQL_CREDENTIALS = "/etc/mythtv/mysql.txt";
# paths to IMDB lists
my $IMDB = "$ENV{HOME}/imdb";
my $IMDB_AKA = "$IMDB/aka-titles.list";
my $IMDB_ISOAKA = "$IMDB/iso-aka-titles.list";
my $IMDB_RATINGS = "$IMDB/ratings.list";
# recent no-matches cache
my $NOMATCHES = "$IMDB/nomatches";
my $NOMATCHESMAXAGE = $^T - 604800;	# 1 week old


# Levenshtein (edit) distances for fuzzy matching - rather inefficient!
use Text::Levenshtein qw(distance);
my $MAXDISTANCE = 3;


# be real nice
setpriority 0,0,20;


# check age of databases
if ( -M $IMDB_AKA > 183
	or -M $IMDB_ISOAKA > 183
	or -M $IMDB_RATINGS > 183 ) {
	warn "WARNING - some of your IMDB data failes are older than 6 months!\nPlease update them!\n\n";
}


# read in db credentials
open CR, $MYSQL_CREDENTIALS
	or die "FATAL - can't read \"$MYSQL_CREDENTIALS\": $!\n";
my $linenumber = 0;
my %credentials;
while ( defined ( my $line = <CR> ) ) {
	chomp $line;
	++$linenumber;
	if ( $line !~ /^(\w+)=(.+)$/ ) {
		die "FATAL - can't parse line $linenumber in \"$MYSQL_CREDENTIALS\"\n";
	}
	$credentials{$1} = $2;
}
close CR;


# read in no-matches so we don't re-try matching stuff we can't match
my %nomatches;
if ( -f $NOMATCHES and -M $NOMATCHES < -M $IMDB_RATINGS ) {	# exists and no newer than ratings
	open NM, $NOMATCHES or die "$0:".__LINE__." FATAL - can't read \"$NOMATCHES\": $!\n";
	while ( defined ( my $line = <NM> ) ) {
		chomp $line;
		my ( $timestamp, $title ) = split ',', $line;
		if ( $timestamp > $NOMATCHESMAXAGE ) {	# recent enough
			$nomatches{$title} = $timestamp;
		}
	}
	close NM;
}


# read imdb aka list
open AKA, $IMDB_AKA or die "$0:".__LINE__." FATAL - can't read \"$IMDB_AKA\": $!\n";
$linenumber = 0;
while ( defined ( my $line = <AKA> ) ) {
	++$linenumber;
	chomp $line;
	if ( $line =~ /^AKA TITLES LIST$/ ) { last; }
}
<AKA>;
++$linenumber;
my $title;
my $year;
my %akadatabase;
while ( defined ( my $line = <AKA> ) ) {
	++$linenumber;
	chomp $line;
	# skip video games
	if ( $line =~ /\(VG\)/ ) { next; }
	# skip blanks
	if ( $line =~ /^$/ ) { next; }
	# end if we hit the last line
	if ( $line =~ /^-----------------------------------------------/ ) { last; }
	# characters at beginning then new title
	if ( $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)$/
		or $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}$/ ) {
		$title = $1;
		$year = $2;
		next;
	}
	# if we match a bracketed line we have the aka title
	if ( $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\)/
		or $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}\)/ ) {
		my $aka = $1;
		my $akayear = $2;
		push @{$akadatabase{$year}->{$title}->{$akayear}}, $aka;
		next;
	}
	# error!
	die "$0:".__LINE__." ERROR: can't understand line $linenumber:\n$line\n";
}
close AKA;

# read imdb iso aka list
open AKA, $IMDB_ISOAKA or die "$0:".__LINE__." FATAL - can't read \"$IMDB_ISOAKA\": $!\n";
$linenumber = 0;
while ( defined ( my $line = <AKA> ) ) {
	++$linenumber;
	chomp $line;
	if ( $line =~ /^AKA TITLES LIST ISO$/ ) { last; }
}
<AKA>;
++$linenumber;
while ( defined ( my $line = <AKA> ) ) {
	++$linenumber;
	chomp $line;
	# skip video games
	if ( $line =~ /\(VG\)/ ) { next; }
	# skip blanks
	if ( $line =~ /^$/ ) { next; }
	# end if we hit the last line
	if ( $line =~ /^-----------------------------------------------/ ) { last; }
	# characters at beginning then new title
	if ( $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)$/
		or $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}$/ ) {
		$title = $1;
		$year = $2;
		next;
	}
	# if we match a bracketed line we have the aka title
	if ( $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\)/
		or $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}\)/ ) {
		my $aka = $1;
		my $akayear = $2;
		push @{$akadatabase{$year}->{$title}->{$akayear}}, $aka;
		next;
	}
	# error!
	die "$0:".__LINE__." ERROR: can't understand line $linenumber:\n$line\n";
}
close AKA;





# read imdb ratings
open RAT, $IMDB_RATINGS or die "$0:".__LINE__." FATAL - can't read \"$IMDB_RATINGS\": $!\n";
$linenumber = 0;
while ( defined ( my $line = <RAT> ) ) {
	++$linenumber;
	chomp $line;
	if ( $line =~ /^MOVIE RATINGS REPORT$/ ) { last; }
}
<RAT>;<RAT>;
++$linenumber;++$linenumber;
my %database;
while ( defined ( my $line = <RAT> ) ) {
	++$linenumber;
	chomp $line;
	# skip video games
	if ( $line =~ /\(VG\)/ ) { next; }
	# end on blank line
	if ( $line =~ /^$/ ) { last; }
	if ( $line !~ /^\s+[\.\d\*]+\s+\d+\s+([\d\.]+)\s+([^\s].*[^\s])\s+\(([\d?]{4}).*\)/
		and $line !~ /^\s+[\.\d\*]+\s+\d+\s+([\d\.]+)\s+([^\s])\s+\(([\d?]{4}).*\)/ ) {
		warn "$0:".__LINE__." WARNING: can't understand line $linenumber:\n$line\n";
		next;
	}
	# we got the info
	my $rating = $1;
	my $title = $2;
	my $year = $3;
	$title =~ s/^"([^"]+)"$/$1/;
	# fill our db
	$database{$year}->{standardise($title)} = $rating;
	$database{titleonly}->{standardise($title)} = $rating;
	# fill db with aliases
	if ( defined ( $akadatabase{$year}->{$title} ) ) {
		foreach my $akayear (keys %{$akadatabase{$year}->{$title}}) {
			foreach my $aka (@{$akadatabase{$year}->{$title}->{$akayear}}) {
				# fill our 
				$database{$year}->{standardise($aka)} = $rating;
				$database{$akayear}->{standardise($aka)} = $rating;
				$database{titleonly}->{standardise($aka)} = $rating;
			}
		}
	}
}
close RAT;


# generate variations
my %databasevariant;
foreach my $year (keys %database) {
	foreach my $name (keys %{$database{$year}}) {
		my $variant = variation ( $name );
		if ( $variant ne $name ) {
			$databasevariant{$year}->{$name} = $database{$year}{$name};
		}
	}
}


use DBI;
my $dbh = DBI->connect (
				'DBI:mysql:'.$credentials{DBName}.':'.$credentials{DBHostName},
				$credentials{DBUserName}, $credentials{DBPassword}
			);

# get entries with zero (no) rating
my $program_data = $dbh->prepare("SELECT DISTINCT title,subtitle,stars,originalairdate FROM program WHERE category_type = 'movie' AND stars = 0");
$program_data->execute();
# try and match them to something in the database and add the rating
while ( my @fields = $program_data->fetchrow_array() ) {
	my $title = standardise ( $fields[0] );	# title
	my $subtitle = standardise_sub ( $fields[1] );	# subtitle
	my $year = $fields[3];	# originalairdate
	# exclude nomatches - don't waste our time on stuff we can't match
	if ( $nomatches{$title} ) { next; }
	print "-------\n";
	if ( $year ) {
		$year =~ s/-\d\d-\d\d$//;	# get year
		print "TryingMatch: $title ($year)\n";
	} else {
		print "TryingMatch: $title (no year)\n";
	}
	# check if we ignore it
	if ( $IGNORE{$title} ) {
		# ignored - give it a minimal score to stop it coming back
		my $sql = 'UPDATE program SET stars = '.0.01;
		$sql .= ' WHERE title = '.$dbh->quote($fields[0]);
		$sql .= ' AND subtitle = '.$dbh->quote($fields[1]);
		if ( $year ) {
			$sql .= ' AND originalairdate = '.$dbh->quote($fields[3]);
		}
		# run the update against the database
		my $program_update = $dbh->prepare ( $sql );
		$program_update->execute();
		next;
	}
	# get the match
	my $rating = bestmatchyear ( $year, $title );
	if ( $rating ) {
		print "got rating: $rating\n";
		# set the entry in the db
		$rating /= 10;
		my $sql = 'UPDATE program SET stars = '.$rating;
		$sql .= ' WHERE title = '.$dbh->quote($fields[0]);
		$sql .= ' AND subtitle = '.$dbh->quote($fields[1]);
		if ( $year ) {
			$sql .= ' AND originalairdate = '.$dbh->quote($fields[3]);
		}
		# print the line
		if ( ! defined ( $fields[3] ) ) { $fields[3] = "-"; }
		print "* $fields[0] : $fields[3] : $fields[2]\n";
		# run the update against the database
		my $program_update = $dbh->prepare ( $sql );
		$program_update->execute();
		next;
	}
	# try mythtv with subtitle
	if ( $subtitle ne '' ) {
		print "trying with subtitle:\n\t>$title<\n\t>$subtitle<\n";
		$rating = bestmatchyear ( $year, "$title $subtitle" );
		if ( $rating ) {
			print "got rating (with subtitle): $rating\n";
			# set the entry in the db
			$rating /= 10;
			my $sql = 'UPDATE program SET stars = '.$rating;
			$sql .= ' WHERE title = '.$dbh->quote($fields[0]);
			$sql .= ' AND subtitle = '.$dbh->quote($fields[1]);
			# prep the sql
			if ( $year ) {
				$sql .= ' AND originalairdate = '.$dbh->quote($fields[3]);
			}
			# print the line
			if ( ! defined ( $fields[3] ) ) { $fields[3] = "-"; }
			print "* $fields[0] : $fields[3] : $fields[2]\n";
			# run the update against the database
			my $program_update = $dbh->prepare ( $sql );
			$program_update->execute();
			next;
		}
	}
	# failed to match
	$nomatches{$title} = $^T;	# store the no-match
	if ( ! defined ( $fields[3] ) ) { $fields[3] = "NoYear"; }
	print "NoMatch: $fields[0] : $fields[1] : $fields[3] : $fields[2]\n";
}


# write no-matches cache for next time
open NM, ">$NOMATCHES" or die "$0:".__LINE__." FATAL - can't write \"$NOMATCHES\": $!\n";
foreach my $title (keys %nomatches) {
	print NM "$nomatches{$title},$title\n";
}
close NM;













sub bestmatchyear {
	my ( $year, $title ) = @_;
	# try match in same year
	my $result = match ( $year, $title );
	if ( $result ) { return $result; }
	# failing that, if year available then try shifting years by one
	if ( $year ) {
		$result = match ( $year-1, $title );
		if ( $result ) { return $result; }
		$result = match ( $year+1, $title );
		if ( $result ) { return $result; }
	}
}





sub match {
	my ( $year, $title ) = @_;
	# check databse for exact match
	my $ratings;
	my $ratingsvariant;
	my @names;
	# check what stuff we have to go on
	if ( $year ) {
		$ratings = $database{$year};
		$ratingsvariant = $databasevariant{$year};
	} else {
		$ratings = $database{titleonly};
		$ratingsvariant = $databasevariant{titleonly};
	}
	# try for an exact match
	if ( $ratings->{$title} ) { return $ratings->{$title}; }
	# try variants match
	if ( $ratingsvariant->{$title} ) { return $ratingsvariant->{$title}; }
	# check Levenshtein to take care of typos etc.
	# this is VERY slow so only do this if all else fails
	# as an optimisation, only check titles of length +-$MAXDISTANCE
	my @titles;
	my $maxlength = length ( $title ) + $MAXDISTANCE;
	my $minlength = length ( $title ) - $MAXDISTANCE;
	foreach my $title (keys %{$ratings}) {
		my $titlelength =  length ( $title );
		if ( $titlelength <= $maxlength and $titlelength >= $minlength ) {
			push @titles, $title;
		}
	}
	# now check the Levenshtein distance agains the similar length titles
	my @distances = distance ( $title, @titles );
	my $distance = 1e6;
	my $counter = 0;
	my $besttitle;
	foreach my $dbtitle (@titles) {
		if ( $distances[$counter] < $distance ) {
#print "$title :: $dbtitle ($distances[$counter])\n";
			$besttitle = $dbtitle;
			$distance = $distances[$counter];
#			if ( $distance == 0 ) { last; }		# perfect match
		}
		++$counter;
	}
	if ( $besttitle and $distance <= $MAXDISTANCE ) {
		return $ratings->{$besttitle};
	}
}







# strip puncuation and use standard representations
sub standardise {
	my $title = shift;
	if ( ! $title ) { return ""; }
	$title = standardise_sub ( $title );
	# extra stuff for titles
	$title =~ s/^a\s(.+)$/$1 a/i;
	$title =~ s/^le\s(.+)$/$1 le/i;
	return $title;
}

sub standardise_sub {
	my $title = shift;
	if ( ! $title ) { return ""; }
	$title = lc ( $title );
	$title =~ s/\xb2/2/;
	$title =~ s/&/ and /g;
	$title =~ s/'//g;	# remove apostrophes/quotes
	$title =~ s/[^\w\s]/ /g;	# non-text characters to spaces
	$title =~ s/\s000\s000\s/ milion /;
	$title =~ s/\s000\s/ thousand /;
	$title =~ s/\s1\s/ one /;
	$title =~ s/^1\s/one /;
	$title =~ s/\s2\s/ two /;
	$title =~ s/^2\s/two /;
	$title =~ s/\s10\s/ ten /;
	$title =~ s/^10\s/ten /;
	$title =~ s/(\s)\s+/$1/g;	# remove multiple spaces
	$title =~ s/^\s+//;	# strip spaces front/end
	$title =~ s/\s+$//;
	$title =~ s/^the\s(.+)$/$1 the/i;	# standardise the use of "the"
	return $title;
}

# this would be in case they left off bits
sub variation {
	my $title = shift;
	$title =~ s/\sa$//;
	$title =~ s/\sthe$//;
	$title =~ s/\sle$//;
	return $title;
}




