#!/usr/bin/perl -w
# Copyright (C) 2009  Glen Pitt-Pladdy
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
#
# See: http://www.pitt-pladdy.com/blog/_20091122-152049%2B0000%20IMDB%20ratings%20for%20MythTV/


# Programmes to ignore - some channels don't seem to know what a "Movie" is.
# Titles are keyed in their standardised form, the same form the main loop
# uses when it checks %IGNORE.
foreach my $not_a_movie (
#	"Film 2007 with Jonathan Ross",
	'Floyd Around The Med',
	'Floyd Uncorked',
	'Cooking The Books',
	'Property List: Top Spots',
	'How Not To Decorate',
	'Getting On The Property Ladder',
) {
	$IGNORE{standardise($not_a_movie)} = 1;
}

# file holding the mysql credentials
$MYSQL_CREDENTIALS = '/etc/mythtv/mysql.txt';
# directory holding the IMDB lists, and the individual list files within it
$IMDB = $ENV{HOME}.'/imdb';
$IMDB_AKA = $IMDB.'/aka-titles.list';
$IMDB_ISOAKA = $IMDB.'/iso-aka-titles.list';
$IMDB_RATINGS = $IMDB.'/ratings.list';


# Levenshtein (edit) distances for fuzzy matching; a fuzzy match is only
# accepted when it is at most $MAXDISTANCE edits away
use Text::Levenshtein qw(distance);
$MAXDISTANCE = 3;


# be real nice: ask for priority value 20 for this process
setpriority ( 0, 0, 20 );


# Read in the db credentials: one KEY=value pair per line, stored in
# %credentials.  Blank lines and "#" comment lines (which a credentials file
# may well contain) are skipped; any other line that does not parse is fatal.
open ( my $credentials_fh, '<', $MYSQL_CREDENTIALS )
	or die "FATAL - can't read \"$MYSQL_CREDENTIALS\": $!\n";
$linenumber = 0;
while ( defined ( my $line = <$credentials_fh> ) ) {
	chomp $line;
	++$linenumber;
	# nothing to parse on an empty or comment-only line
	next if $line =~ /^\s*(?:#.*)?$/;
	if ( $line !~ /^(\w+)=(.+)$/ ) {
		die "FATAL - can't parse line $linenumber in \"$MYSQL_CREDENTIALS\"\n";
	}
	$credentials{$1} = $2;
}
close $credentials_fh;



# Load one IMDB "aka titles" list into %akadatabase.
#   $path   - list file to read
#   $header - exact text of the heading line that precedes the data
# Every alias is stored as
#   $akadatabase{year}->{title}->{aka year} = [ aka title, ... ]
# with year and title taken from the most recent title line.
# Dies when the file can't be opened or a data line can't be understood.
sub load_aka_list {
	my ( $path, $header ) = @_;
	open ( my $list, '<', $path )
		or die "FATAL: can't read \"$path\": $!\n";
	# skip everything up to and including the heading ...
	while ( defined ( my $line = <$list> ) ) {
		chomp $line;
		last if $line eq $header;
	}
	# ... and the one line directly below it
	<$list>;
	# title and year of the entry the following aka lines belong to
	my ( $title, $year );
	while ( defined ( my $line = <$list> ) ) {
		chomp $line;
		# skip video games
		next if $line =~ /\(VG\)/;
		# skip blanks
		next if $line =~ /^$/;
		# end if we hit the last line
		last if $line =~ /^-----------------------------------------------/;
		# characters at beginning then new title
		if ( $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)$/
			or $line =~ /^([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}$/ ) {
			( $title, $year ) = ( $1, $2 );
			next;
		}
		# if we match a bracketed line we have the aka title
		if ( $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\)/
			or $line =~ /^\s+\(aka\s+([^\s].*)\s\(([\d?]{4}).*\)\s\{.+\}\)/ ) {
			my ( $aka, $akayear ) = ( $1, $2 );
			push @{$akadatabase{$year}->{$title}->{$akayear}}, $aka;
			next;
		}
		# error!
		die "!!!! ERROR: can't understand line:\n$line\n";
	}
	close $list;
}

# read imdb aka list
load_aka_list ( $IMDB_AKA, 'AKA TITLES LIST' );

# read imdb iso aka list
load_aka_list ( $IMDB_ISOAKA, 'AKA TITLES LIST ISO' );





# Read the imdb ratings and fill %database:
#   $database{year}->{standardised title} = rating
#   $database{titleonly}->{standardised title} = rating
# (the latter for lookups where no year is known).  Every alias recorded in
# %akadatabase for a title is entered with that title's rating and year.
# Lines that can't be parsed are reported on STDERR and skipped.
open ( my $ratings_fh, '<', $IMDB_RATINGS )
	or die "FATAL: can't read \"$IMDB_RATINGS\": $!\n";
# skip everything up to and including the heading ...
while ( defined ( my $line = <$ratings_fh> ) ) {
	chomp $line;
	last if $line eq 'MOVIE RATINGS REPORT';
}
# ... and the two lines directly below it
<$ratings_fh>; <$ratings_fh>;
while ( defined ( my $line = <$ratings_fh> ) ) {
	chomp $line;
	# skip video games
	next if $line =~ /\(VG\)/;
	# end on blank line
	last if $line =~ /^$/;
	# the second pattern covers titles that are a single character long
	if ( $line !~ /^\s+[\.\d\*]+\s+\d+\s+([\d\.]+)\s+([^\s].*[^\s])\s+\(([\d?]{4}).*\)/
		and $line !~ /^\s+[\.\d\*]+\s+\d+\s+([\d\.]+)\s+([^\s])\s+\(([\d?]{4}).*\)/ ) {
		print STDERR "bad line: $line\n";
		next;
	}
	# we got the info (copied at once, before another match resets $1..$3)
	my ( $rating, $listed_title, $year ) = ( $1, $2, $3 );
	# strip enclosing double quotes, if any
	( my $title = $listed_title ) =~ s/^"([^"]+)"$/$1/;
	# fill our db
	my $standard_title = standardise ( $title );
	$database{$year}->{$standard_title} = $rating;
	$database{titleonly}->{$standard_title} = $rating;
	# fill db with aliases - the aka reader keeps titles exactly as listed
	# (quotes included), so the lookup has to use the unstripped title
	next unless $akadatabase{$year}
		and $akadatabase{$year}->{$listed_title};
	my $aliases = $akadatabase{$year}->{$listed_title};
	foreach my $akayear ( keys %{$aliases} ) {
		foreach my $aka ( @{$aliases->{$akayear}} ) {
			# filed under the year of the rated title, not the aka year
			my $standard_aka = standardise ( $aka );
			$database{$year}->{$standard_aka} = $rating;
			$database{titleonly}->{$standard_aka} = $rating;
		}
	}
}
close $ratings_fh;


# Generate variations: index every title under the form variation() gives it
# (trailing article dropped), so that a title which was given without its
# article can still be found in %databasevariant.
foreach my $year ( keys %database ) {
	foreach my $name ( keys %{$database{$year}} ) {
		my $variant = variation ( $name );
		# titles without a trailing article have no variation
		next if $variant eq $name;
		# keyed by the variant - keying by the full name would only repeat
		# what %database already holds
		$databasevariant{$year}->{$variant} = $database{$year}->{$name};
	}
}


use DBI;
$dbh = DBI->connect (
				'DBI:mysql:'.$credentials{DBName}.':'.$credentials{DBHostName},
				$credentials{DBUserName}, $credentials{DBPassword}
			)
	or die "FATAL - can't connect to database: $DBI::errstr\n";

# Store a stars value against a programme.
#   $stars    - value to store
#   $title    - title exactly as held in the database
#   $subtitle - subtitle exactly as held in the database
#   $airdate  - originalairdate to narrow the update by, or undef to update
#               on title and subtitle alone
# Failures are left to DBI's own error reporting.
sub set_stars {
	my ( $stars, $title, $subtitle, $airdate ) = @_;
	my $sql = 'UPDATE program SET stars = ? WHERE title = ? AND subtitle = ?';
	my @values = ( $stars, $title, $subtitle );
	if ( defined $airdate ) {
		$sql .= ' AND originalairdate = ?';
		push @values, $airdate;
	}
	$dbh->do ( $sql, undef, @values );
}

# Go through the movies with zero (no) rating: ignored titles get a token
# rating, everything else is looked up by title and then by title plus
# subtitle.  A rating that is found is divided by ten before being stored.
$program_data = $dbh->prepare("SELECT DISTINCT title,subtitle,stars,originalairdate FROM program WHERE category_type = 'movie' AND stars = 0");
$program_data->execute();
while ( my @fields = $program_data->fetchrow_array() ) {
	my ( $dbtitle, $dbsubtitle, $stars, $airdate ) = @fields;
	if ( $stars > 0 ) { print "already rated: $dbtitle\n"; next; }		# already rated
	my $title = standardise ( $dbtitle );
	print ">>> $title\n";
	my $subtitle = standardise_sub ( $dbsubtitle );
	# get year: the air date with its -MM-DD tail removed
	my $year = $airdate;
	if ( $year ) { $year =~ s/-\d\d-\d\d$//; }
	# the update is only narrowed by air date when that gave us a year
	my $update_airdate = $year ? $airdate : undef;
	# check if we ignore it
	if ( $IGNORE{$title} ) {
		# ignored - give it a minimal score to stop it coming back
		set_stars ( 0.01, $dbtitle, $dbsubtitle, $update_airdate );
		next;
	}
	# get the match on the title alone, else try mythtv with subtitle
	my $rating = bestmatchyear ( $year, $title );
	if ( $rating ) {
		print "got rating: $rating\n";
	} else {
		$rating = bestmatchyear ( $year, "$title $subtitle" );
	}
	my $shown_airdate = defined $airdate ? $airdate : "-";
	if ( ! $rating ) {
		# failed to match
		print "XXXX $dbtitle : $dbsubtitle : $shown_airdate : $stars\n";
		next;
	}
	# print the line, then set the entry in the db
	print "* $dbtitle : $shown_airdate : $stars\n";
	set_stars ( $rating / 10, $dbtitle, $dbsubtitle, $update_airdate );
}
$dbh->disconnect;
















# Find the rating for a title, allowing the year to be one out.
#   $year  - year of the programme, or a false value when it is not known
#   $title - standardised title to look for
# match() is tried for the year as given and then, only when a year is
# known, for the year before and the year after.
# Returns the first rating found, or nothing (false) when there is none.
sub bestmatchyear {
	my ( $year, $title ) = @_;
	my @years = ( $year );
	if ( $year ) { push @years, $year - 1, $year + 1; }
	foreach my $tryyear ( @years ) {
		my $result = match ( $tryyear, $title );
		if ( $result ) { return $result; }
	}
	# no match - say so explicitly rather than by falling off the end
	return;
}





# Look a title up among the ratings of one year.
#   $year  - year to search, or a false value to search the "titleonly"
#            entries instead
#   $title - standardised title to look for
# Tried in order: exact match, match against the variants (titles with the
# trailing article dropped), and finally the closest title by Levenshtein
# distance, accepted only when it is within $MAXDISTANCE.
# Returns the rating, or nothing (false) when no match is found.
sub match {
	my ( $year, $title ) = @_;
	# check what stuff we have to go on
	my $key = $year ? $year : 'titleonly';
	my $ratings = $database{$key};
	my $ratingsvariant = $databasevariant{$key};
	# nothing at all is known for this year
	if ( ! $ratings ) { return; }
	# try for an exact match
	if ( $ratings->{$title} ) { return $ratings->{$title}; }
	# try variants match
	if ( $ratingsvariant and $ratingsvariant->{$title} ) {
		return $ratingsvariant->{$title};
	}
	# check Levenshtein to take care of typos etc.
	# this is VERY slow so only do this if all else fails
	# the keys are fetched once so titles and distances are sure to line up
	my @dbtitles = keys %{$ratings};
	if ( ! @dbtitles ) { return; }
	my @distances = distance ( $title, @dbtitles );
	my $besttitle;
	my $bestdistance = 1e6;
	foreach my $index ( 0 .. $#dbtitles ) {
		# strictly smaller: on a tie the earlier title is kept
		if ( $distances[$index] < $bestdistance ) {
			$besttitle = $dbtitles[$index];
			$bestdistance = $distances[$index];
		}
	}
	if ( defined $besttitle and $bestdistance <= $MAXDISTANCE ) {
		return $ratings->{$besttitle};
	}
	return;
}







# Strip punctuation and use standard representations (via standardise_sub),
# then move a leading article - "a", "the" or "le", tried in that order - to
# the end of the title.
# Returns "" when given a false value (undefined, empty or "0").
sub standardise {
	my $title = shift;
	return "" unless $title;
	$title = standardise_sub ( $title );
	# extra stuff for titles
	foreach my $article ( qw(a the le) ) {
		$title =~ s/^$article\s(.+)$/$1 $article/i;
	}
	return $title;
}

# Put a piece of text (title or subtitle) into a standard form for matching.
# The text is lower-cased, byte 0xB2 becomes "2", "&" becomes "and", anything
# that is neither a word character nor whitespace becomes a space, a few
# numbers are spelt out ("1", "2", "10", and the "000" / "000 000" groups a
# number with thousands separators is left as), runs of whitespace are
# collapsed and the ends are trimmed.
# Returns "" when given a false value (undefined, empty or "0").
sub standardise_sub {
	my $title = shift;
	if ( ! $title ) { return ""; }
	$title = lc ( $title );
	$title =~ s/\xb2/2/g;
	$title =~ s/&/ and /g;
	$title =~ s/[^\w\s]/ /g;
	# Numbers are matched as whole whitespace-delimited words.  The
	# lookarounds do not consume the surrounding whitespace, so numbers at
	# the start or end of the text and numbers next to each other are all
	# converted.
	$title =~ s/(?<!\S)000\s000(?!\S)/million/g;
	$title =~ s/(?<!\S)000(?!\S)/thousand/g;
	$title =~ s/(?<!\S)1(?!\S)/one/g;
	$title =~ s/(?<!\S)2(?!\S)/two/g;
	$title =~ s/(?<!\S)10(?!\S)/ten/g;
	# keep only the first character of each run of whitespace
	$title =~ s/(\s)\s+/$1/g;
	$title =~ s/^\s+//;
	$title =~ s/\s+$//;
	return $title;
}

# Variation of a standardised title for the case where they left off bits:
# strips a trailing " a", then a trailing " the", then a trailing " le".
# Returns the title unchanged when none of them is present.
sub variation {
	my $title = shift;
	foreach my $article ( qw(a the le) ) {
		$title =~ s/\s${article}$//;
	}
	return $title;
}



