import java.io.*;
import java.lang.*;
import java.sql.*;
import java.util.*;

/**
 *  Gets the word frequencies from all movies the user has rated.
 *  This only counts words for ratings that have been included in the training set.
 *
 * @author     Brandon Douthit-Wood
 * @created    March 31, 2004
 */
public class GetUserWordFrequency {

	// hash tables used to store words and frequencies
	private Hashtable positiveHash, negativeHash, stopList;

	/** Initializes the hashtables, connects to the DB */
	public GetUserWordFrequency() {
		stopList = new Hashtable();
		positiveHash = new Hashtable();
		negativeHash = new Hashtable();

		if ( !Query.connectToDB() ) {
			System.exit( 0 );
		}
	}

	/**  Calculates word frequencies for all movies user has rated positively (3-5) */
	public void calculatePositiveWordFrequency() {
		String id;
		String words;
		String word;
		String query;
		String title;
		String newWords;
		Hashtable hash;
		Integer numWords;
		Integer globalNumWords;
		StringTokenizer token;
		BufferedReader input;
		Enumeration enum;

		// delimiters to strip out of words
		String delims = " \t\n\r\f`~!@#$%^&*()_-+={[}]|\\<,>.?/:;\"'";

		while ( true ) {
			// continually loop through users until all have been processed
			query = "select * from user where pos_word_freq='' limit 1000";
			ResultSet userResult = Query.executeQuery( query );
			if ( Query.getNumResults( userResult ) == 0 ) {
				return;
			}

			try {
				while ( userResult.next() ) {
					id = userResult.getString( "id" );
					words = userResult.getString( "pos_words" );

					hash = new Hashtable();
					newWords = "";

					// tokenize words from the movie
					token = new StringTokenizer( words, delims );
					while ( token.hasMoreTokens() ) {
						word = token.nextToken();
						word = word.toLowerCase();

						// throw out any words in stopList
						if ( stopList.containsKey( word ) ) {
							continue;
						}
						// throw out other junk
						if ( word.equals( "ii" ) || word.equals( "iii" ) || word.equals( "iv" )
							 || word.equals( "jr" ) || word.equals( "sr" ) || word.equals( "x" ) ) {
							continue;
						}

						// lookup the current word in hashtable
						numWords = (Integer) hash.get( word );
						// not in hashtable, add it
						if ( numWords == null ) {
							numWords = new Integer( 1 );
							hash.put( word, numWords );
						}
						// allready in hashtable, increment frequency count
						else {
							numWords = new Integer( numWords.intValue() + 1 );
							hash.put( word, numWords );
						}
					}

					// calculate frequencies
					int size = hash.size();
					int num;
					enum = hash.keys();
					while ( enum.hasMoreElements() ) {
						word = (String) enum.nextElement();
						num = ( (Integer) hash.get( word ) ).intValue();
						if ( num == 1 ) {
							continue;
						}

						double freq = (double) num / size;
						newWords += word + ":" + freq + ":";
					}

					// if word frequency is less than 3, we will ignore it
					if ( newWords.length() < 3 ) {
						newWords = "x";
					}
					else {
						newWords = newWords.substring( 0, newWords.length() - 1 );
					}

					// update the user table
					query = "update user set pos_word_freq='" + newWords + "' where id=" + id;
					Query.executeUpdate( query );
					System.out.println( id );
				}
				userResult.close();
			}
			catch ( SQLException e ) {
				System.out.println( "Error parsing words..." );
				e.printStackTrace();
				System.exit( 0 );
			}
		}
	}

	/**  Calculate the word frequency for all words the user has rated negatively (1-2) */
	public void calculateNegativeWordFrequency() {
		String id;
		String words;
		String word;
		String query;
		String title;
		String newWords;
		Hashtable hash;
		Integer numWords;
		Integer globalNumWords;
		StringTokenizer token;
		BufferedReader input;
		Enumeration enum;

		// delimiters to be thrown out of words
		String delims = " \t\n\r\f`~!@#$%^&*()_-+={[}]|\\<,>.?/:;\"'";

		while ( true ) {
			// continue to loop through users until they have all been processed
			query = "select * from user where neg_word_freq='' limit 1000";
			ResultSet userResult = Query.executeQuery( query );
			if ( Query.getNumResults( userResult ) == 0 ) {
				return;
			}

			try {
				while ( userResult.next() ) {
					id = userResult.getString( "id" );
					words = userResult.getString( "neg_words" );

					hash = new Hashtable();
					newWords = "";

					// tokenize words
					token = new StringTokenizer( words, delims );
					while ( token.hasMoreTokens() ) {
						word = token.nextToken();
						word = word.toLowerCase();

						// throw out any words in stopList
						if ( stopList.containsKey( word ) ) {
							continue;
						}
						// throw out other junk
						if ( word.equals( "ii" ) || word.equals( "iii" ) || word.equals( "iv" )
							 || word.equals( "jr" ) || word.equals( "sr" ) || word.equals( "x" ) ) {
							continue;
						}

						// lookup the current word in hashtable
						numWords = (Integer) hash.get( word );
						// not in hashtable, add it
						if ( numWords == null ) {
							numWords = new Integer( 1 );
							hash.put( word, numWords );
						}
						// allready in hashtable, increment frequency count
						else {
							numWords = new Integer( numWords.intValue() + 1 );
							hash.put( word, numWords );
						}
					}

					// calculate frequencies
					int size = hash.size();
					int num;
					enum = hash.keys();
					while ( enum.hasMoreElements() ) {
						word = (String) enum.nextElement();
						num = ( (Integer) hash.get( word ) ).intValue();
						if ( num == 1 ) {
							continue;
						}

						double freq = (double) num / size;
						newWords += word + ":" + freq + ":";
					}

					// if frequency is less than 3, we will ignore that word
					if ( newWords.length() < 3 ) {
						newWords = "x";
					}
					else {
						newWords = newWords.substring( 0, newWords.length() - 1 );
					}

					// update the user table
					query = "update user set neg_word_freq='" + newWords + "' where id=" + id;
					Query.executeUpdate( query );
					System.out.println( id );
				}
				userResult.close();
			}
			catch ( SQLException e ) {
				System.out.println( "Error parsing words..." );
				e.printStackTrace();
				System.exit( 0 );
			}
		}
	}

	/**
	 *  Reads the stop list into a hash table - these are common words and should be ignored since
	 *  they do not provide much contextual information.
	 */
	public void readStopList() {
		String filename = "stoplist.txt";
		String word;
		BufferedReader input;

		try {
			// open stop list file for reading
			input = new BufferedReader( new FileReader( filename ) );
			word = input.readLine();

			// add words to stoplist hashtable
			while ( word != null ) {
				stopList.put( word, word );
				word = input.readLine();
			}
			input.close();
		}
		catch ( FileNotFoundException e ) {
			System.out.println( "Could not find stoplist file: " + filename );
			e.printStackTrace();
			System.exit( 0 );
		}
		catch ( IOException e ) {
			System.err.println( "Error reading stoplist file: " + filename );
			e.printStackTrace();
			System.exit( 0 );
		}
	}

	/**
	 *  Gets the word frequencies for the movies each user has rated.
	 *
	 * @param  args  The command line arguments
	 */
	public static void main( String[] args ) {
		GetUserWordFrequency words = new GetUserWordFrequency();
		words.readStopList();
		System.out.println( "getting positive word frequencies..." );
		words.calculatePositiveWordFrequency();
		System.out.println( "getting negative word frequencies..." );
		words.calculateNegativeWordFrequency();
	}
}

