# Reads in all the words that we are interested in and returns a sorted list. 
# YOU SHOULD NOT NEED TO CHANGE THIS FUNCTION
def getWordList(filename):
    """Read a whitespace-separated word file and return its words, sorted.

    Args:
        filename: Path of the text file to read.

    Returns:
        A new list holding every whitespace-delimited token in the file,
        in ascending order. Duplicates are kept; an empty file gives [].

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'r') as wordFile:
        contents = wordFile.read()

    # split() with no arguments splits on any run of whitespace
    # (spaces, tabs, newlines) and never yields empty strings.
    return sorted(contents.split())

# This is Task 1
# Given a file, we will build a word-frequency dictionary for all the words in it.
def buildDictionary(filename):
    """Build a normalized word-frequency dictionary for a text file.

    Every whitespace-delimited token in the file is counted, then each count
    is divided by the total number of words in the file, so that long and
    short texts can be compared on the same scale.

    Args:
        filename: Path of the text file to analyse.

    Returns:
        A dict mapping each distinct word to its relative frequency
        (occurrences / total words). The values sum to 1 (up to floating
        point rounding). An empty file gives an empty dict.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    wordFreq = {}  # this is the dictionary

    # 1. Read in the file and split it into words.
    with open(filename, 'r') as textFile:
        allWords = textFile.read().split()

    # 2. Build the dictionary: a word seen before gets its count bumped by 1,
    #    a new word starts at 1 (get() supplies the 0 for unseen words).
    for word in allWords:
        wordFreq[word] = wordFreq.get(word, 0) + 1

    # 3. Normalization: divide each count by the number of words in the file.
    #    With no words the loop body never runs, so there is no division
    #    by zero for an empty file.
    totalWords = len(allWords)
    for word in wordFreq:
        # Only values are replaced, so the dict's size does not change
        # while we iterate over it.
        wordFreq[word] = wordFreq[word] / totalWords

    # 4. Return the dictionary
    return wordFreq

# This is Task 2
# Calculates the "distance" between two word-frequency dictionaries.
# We use the Pythagorean theorem that we learned in class.
# We only need to consider the words inside wordList, even though wordFreq1 and wordFreq2
# may contain other words.
def calculateDistance(wordFreq1, wordFreq2, wordList):
    """Return the Euclidean distance between two word-frequency dictionaries.

    Only the words listed in wordList take part in the calculation; any other
    keys in the two dictionaries are ignored. A word missing from a
    dictionary counts as frequency 0 there.

    Args:
        wordFreq1: Dict mapping words to frequencies.
        wordFreq2: Dict mapping words to frequencies.
        wordList: Iterable of the keywords to compare on.

    Returns:
        The square root of the sum of squared frequency differences over
        wordList, as a float. An empty wordList gives 0.0.
    """
    distance = 0.0  # running sum of squared differences

    # 1. Go through the keywords in turn and accumulate the squared
    #    difference between the two frequencies.
    for word in wordList:
        freq1 = wordFreq1.get(word, 0)  # 0 when the word is absent
        freq2 = wordFreq2.get(word, 0)
        difference = freq1 - freq2
        distance += difference ** 2

    # 2. Square root of the sum (power 0.5) gives the distance.
    return distance ** 0.5

# Given a list of files and a file of words, does the pairwise comparison between all files
# in the list based on the frequencies of the words in the file.
# It will return a 2-D matrix that can be printed out into a CSV file
# YOU SHOULD NOT NEED TO CHANGE THIS FUNCTION
def compareSimilarity(fileList, wordFile):
    """Compare every pair of files by keyword-frequency distance.

    Args:
        fileList: List of paths of the text files to compare.
        wordFile: Path of the file holding the keywords to compare on.

    Returns:
        A 2-D list (list of rows). The first row is a header: " " followed
        by the file names. Each later row starts with a file name, followed
        by its distance to every file in fileList, in fileList order.
    """
    wordList = getWordList(wordFile)  # read in the keywords

    # Build each file's dictionary once up front. Rebuilding it inside the
    # inner loop re-read and re-counted every file once per row.
    dictionaries = [buildDictionary(name) for name in fileList]

    similarityMatrix = [[" "] + fileList]  # starts with the header row
    for rowName, rowDict in zip(fileList, dictionaries):
        thisRow = [rowName]  # new row for the matrix, with a header column
        for columnDict in dictionaries:
            thisRow.append(calculateDistance(rowDict, columnDict, wordList))
        similarityMatrix.append(thisRow)  # adds the row to the matrix
    return similarityMatrix

# This function is given a 2-D similarity matrix and the name of an output file.
# It will print out the similarity matrix in a CSV format
# YOU SHOULD NOT NEED TO CHANGE THIS FUNCTION
def printCSVfile(matrix, filename):
    """Write a 2-D matrix to a text file in a comma-separated layout.

    A progress message is printed to standard output first. Each row of the
    matrix becomes one line of the file; every element is converted with
    str() and followed by ", " (so each line ends with a trailing ", ").

    Args:
        matrix: Iterable of rows, each row an iterable of elements.
        filename: Path of the output file; it is created or overwritten.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    print("Printing the matrix into", filename)
    # The context manager closes the file even if converting or writing an
    # element raises part-way through.
    with open(filename, "w") as outfile:
        # go through each row in turn
        for row in matrix:
            # One long string holding all the entries of this row.
            line = "".join(str(element) + ", " for element in row)
            print(line, file=outfile)  # Print to file



##########################################################################################
# These functions run the programming statements necessary for Tasks 1, 2, 3 and 4
# You should not need to change anything in this section
##########################################################################################

# This function runs Tasks 1 and 2,
# using two small files (cat.txt and dog.txt) for testing
def doTasks1_and_2():
    """Run Tasks 1 and 2 on the two test files and print the results.

    Builds the word-frequency dictionaries for cat.txt and dog.txt, prints
    each one, then prints their distance over the words in keywords.txt.
    """
    # Task 1: one dictionary per test file. Check each printout manually
    # to make sure that it is correct.
    print("Building the word frequency dictionary for cat.txt")
    cat_frequencies = buildDictionary("cat.txt")
    print("The wordFrequency dictionary for cat.txt is", cat_frequencies)

    print("Building the word frequency dictionary for dog.txt")
    dog_frequencies = buildDictionary("dog.txt")
    print("The wordFrequency dictionary for dog.txt is", dog_frequencies)

    # Task 2: distance between the two dictionaries over the keywords.
    # Should print 0.128564869306645
    print(
        "The distance between cat.txt and dog.txt based on the keywords is",
        calculateDistance(
            cat_frequencies, dog_frequencies, getWordList("keywords.txt")
        ),
    )

# This function does the necessary stuff for Task 3
def doTask3():
    """Run Task 3: compare the authorship files and save the result.

    The five files a0.txt to a4.txt are compared pairwise on the words in
    stopwords.txt, and the resulting matrix is written to authorship.csv.
    """
    texts = ["a0.txt", "a1.txt", "a2.txt", "a3.txt", "a4.txt"]
    matrix = compareSimilarity(texts, "stopwords.txt")
    printCSVfile(matrix, "authorship.csv")

# This function does the necessary stuff for Task 4
def doTask4():
    """Run Task 4: compare the legislator speech files and save the result.

    The speech files are compared pairwise on the words in keywords.txt,
    and the resulting matrix is written to politicalView.csv.
    """
    speeches = [
        "ABRAHAM_SHEK.txt",
        "ALAN_LEONG.txt",
        "ALBERT_CHAN.txt",
        "ALBERT_HO.txt",
        "ALICE_MAK.txt",
        "ANDREW_LEUNG.txt",
        "CHARLES_PETER_MOK.txt",
        "CHRISTOPHER_CHEUNG.txt",
        "CHRISTOPHER_CHUNG.txt",
        "CLAUDIA_MO.txt",
        "CYD_HO.txt",
        "DENNIS_KWOK.txt",
        "ELIZABETH_QUAT.txt",
        "EMILY_LAU.txt",
        "FERNANDO_CHEUNG.txt",
        "FRANKIE_YICK.txt",
        "FREDERICK_FUNG.txt",
        "GARY_FAN.txt",
        "HELENA_WONG.txt",
        "JAMES_TIEN.txt",
        "JAMES_TO.txt",
        "JEFFREY_LAM.txt",
        "JOSEPH_LEE.txt",
        "KENNETH_CHAN.txt",
        "KENNETH_LEUNG.txt",
        "MARTIN_LIAO.txt",
        "MICHAEL_TIEN.txt",
        "PAUL_TSE.txt",
        "PRISCILLA_LEUNG.txt",
        "REGINA_IP.txt",
        "RONNY_TONG.txt",
        "STARRY_LEE.txt",
        "STEVEN_HO.txt",
        "TOMMY_CHEUNG.txt",
        "TONY_TSE.txt",
        "VINCENT_FANG.txt",
    ]
    matrix = compareSimilarity(speeches, "keywords.txt")
    printCSVfile(matrix, "politicalView.csv")

##########################################################################################
# These programming statements call the functions that run Tasks 1, 2, 3 & 4
# Uncomment them as you get to the corresponding task
##########################################################################################

# Runs as soon as this file is executed; reads cat.txt, dog.txt and keywords.txt
# (relative paths, so they are looked up in the current working directory).
doTasks1_and_2()
# doTask3()  # Don't uncomment this until you have downloaded and unzipped the speech files!
# doTask4()  # Don't uncomment this until you have downloaded and unzipped the legislator files!