# Reads in all the words that we are interested in and returns a sorted list.
# YOU SHOULD NOT NEED TO CHANGE THIS FUNCTION
def getWordList(filename):
    """Return the whitespace-separated words of a file as a sorted list.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of the words in the file, sorted in ascending order.
        Duplicate words are kept.
    """
    # The context manager closes the file even if read() raises.
    with open(filename, 'r') as myFile:
        myString = myFile.read()
    return sorted(myString.split())


# This is Task 1
# Given a file, we will build a word-frequency dictionary for all the words in it.
def buildDictionary(filename):
    """Build a normalized word-frequency dictionary for a file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping each distinct word to (occurrences of the word) divided
        by (total number of words in the file). An empty file gives an empty
        dict.
    """
    # 1. Read in the file and split it into words.
    with open(filename, 'r') as myFile:
        allWords = myFile.read().split()

    # 2. Build the dictionary.
    #    If the word has been seen before, add 1 to its value;
    #    otherwise, set the value to 1.
    wordFreq = {}  # this is the dictionary
    for word in allWords:
        wordFreq[word] = wordFreq.get(word, 0) + 1

    # 3. Normalization
    #    Some people like to talk a lot. So we divide the frequency of each
    #    word by the number of words that they spoke.
    #    When the file is empty the dictionary is empty too, so this loop never
    #    runs and no division by zero can happen.
    totalWords = len(allWords)
    for word in wordFreq:
        wordFreq[word] = wordFreq[word] / totalWords

    # 4. Return the dictionary
    return wordFreq


# This is Task 2
# Calculates the "distance" between two word-frequency dictionaries
# We shall use the Pythagoras Theorem that we learned in class.
# We only need to consider the words inside wordList, even though wordFreq1 and wordFreq2
# may have other words
def calculateDistance(wordFreq1, wordFreq2, wordList):
    """Return the Euclidean distance between two word-frequency dictionaries.

    Args:
        wordFreq1: Dict mapping word -> frequency.
        wordFreq2: Dict mapping word -> frequency.
        wordList: The keywords to compare on. Words of the dictionaries that
            are not in this list are ignored.

    Returns:
        The square root of the sum, over the words in wordList, of the squared
        difference between the two frequencies. A word missing from a
        dictionary counts as frequency zero.
    """
    distance = 0.0  # This is the distance that we will calculate

    # 1. Go through the words in the list of keywords in turn.
    #    An explicit running total is used rather than sum() so the additions
    #    happen in plain left-to-right order.
    for word in wordList:
        freq1 = wordFreq1.get(word, 0.0)
        freq2 = wordFreq2.get(word, 0.0)
        difference = freq1 - freq2
        distance = distance + difference * difference

    # 2. Calculate the square root of the distance
    #    (square root equals to the power 0.5)
    distance = distance ** 0.5

    # And we will return the distance
    return distance


# Given a list of files and a file of words, does the pairwise comparison between all files
# in the list based on the frequencies of the words in the file.
# It will return a 2-D matrix that can be printed out into a CSV file
# YOU SHOULD NOT NEED TO CHANGE THIS FUNCTION
def compareSimilarity(fileList, wordFile):
    """Compute the pairwise distance matrix for a list of files.

    Args:
        fileList: List of paths of the files to compare.
        wordFile: Path of the file holding the keywords to compare on.

    Returns:
        A 2-D list. The first row is [" "] followed by the file names; every
        following row starts with a file name followed by its distance to each
        file in fileList, in order.
    """
    wordList = getWordList(wordFile)  # read in the keywords

    # Build each dictionary once up front instead of re-reading every file
    # inside the inner loop.
    dictionaries = [buildDictionary(name) for name in fileList]

    similarityMatrix = [[" "] + fileList]  # The header row
    for name, dict1 in zip(fileList, dictionaries):
        thisRow = [name]  # new row for the matrix, with a header column
        for dict2 in dictionaries:
            thisRow.append(calculateDistance(dict1, dict2, wordList))
        similarityMatrix.append(thisRow)  # adds the row to the matrix
    return similarityMatrix


# This function is given a 2-D similarity matrix and the name of an output file.
# It will print out the similarity matrix in a CSV format
# YOU SHOULD NOT NEED TO CHANGE THIS FUNCTION
def printCSVfile(matrix, filename):
    """Write a 2-D matrix to a file, one row per line.

    Every element is converted with str() and followed by ", ", so each line
    ends with a trailing ", ".

    Args:
        matrix: Iterable of rows, each row an iterable of elements.
        filename: Path of the output file; it is overwritten.
    """
    print("Printing the matrix into", filename)
    # The context manager closes the file even if writing a row fails.
    with open(filename, "w") as outfile:
        # go through each row in turn
        for row in matrix:
            # Make a long string that will hold all the indices in this row
            line = "".join(str(element) + ", " for element in row)
            print(line, file=outfile)  # Print to file


##########################################################################################
# These functions run the programming statements necessary for Tasks 1, 2, 3 and 4
# You should not need to change anything in this section
##########################################################################################

# These files are for Tasks 1 and 2
# Two files that we will use for testing
def doTasks1_and_2():
    """Print the dictionaries for cat.txt and dog.txt and their distance."""
    print("Building the word frequency dictionary for cat.txt")
    catFreq = buildDictionary("cat.txt")
    # Check manually to make sure that this is correct.
    print("The wordFrequency dictionary for cat.txt is", catFreq)

    print("Building the word frequency dictionary for dog.txt")
    dogFreq = buildDictionary("dog.txt")
    # Check manually to make sure that this is correct
    print("The wordFrequency dictionary for dog.txt is", dogFreq)

    keywords = getWordList("keywords.txt")
    distance = calculateDistance(catFreq, dogFreq, keywords)
    # Should print 0.128564869306645
    print("The distance between cat.txt and dog.txt based on the keywords is", distance)


# This function does the necessary stuff for Task 3
def doTask3():
    """Compare the authorship files on the stop words and write authorship.csv."""
    authorshipResult = "authorship.csv"
    authorshipFiles = ["a0.txt", "a1.txt", "a2.txt", "a3.txt", "a4.txt"]
    stopwordsfile = "stopwords.txt"

    authorshipMatrix = compareSimilarity(authorshipFiles, stopwordsfile)
    printCSVfile(authorshipMatrix, authorshipResult)


# These files are for Task 4
def doTask4():
    """Compare the legislator speeches on the keywords and write politicalView.csv."""
    politicalViewResult = "politicalView.csv"
    legislatorSpeeches = [
        "ABRAHAM_SHEK.txt", "ALAN_LEONG.txt", "ALBERT_CHAN.txt",
        "ALBERT_HO.txt", "ALICE_MAK.txt", "ANDREW_LEUNG.txt",
        "CHARLES_PETER_MOK.txt", "CHRISTOPHER_CHEUNG.txt",
        "CHRISTOPHER_CHUNG.txt", "CLAUDIA_MO.txt", "CYD_HO.txt",
        "DENNIS_KWOK.txt", "ELIZABETH_QUAT.txt", "EMILY_LAU.txt",
        "FERNANDO_CHEUNG.txt", "FRANKIE_YICK.txt", "FREDERICK_FUNG.txt",
        "GARY_FAN.txt", "HELENA_WONG.txt", "JAMES_TIEN.txt", "JAMES_TO.txt",
        "JEFFREY_LAM.txt", "JOSEPH_LEE.txt", "KENNETH_CHAN.txt",
        "KENNETH_LEUNG.txt", "MARTIN_LIAO.txt", "MICHAEL_TIEN.txt",
        "PAUL_TSE.txt", "PRISCILLA_LEUNG.txt", "REGINA_IP.txt",
        "RONNY_TONG.txt", "STARRY_LEE.txt", "STEVEN_HO.txt",
        "TOMMY_CHEUNG.txt", "TONY_TSE.txt", "VINCENT_FANG.txt",
    ]
    keywordsfile = "keywords.txt"

    legislatorMatrix = compareSimilarity(legislatorSpeeches, keywordsfile)
    printCSVfile(legislatorMatrix, politicalViewResult)


##########################################################################################
# These programming statements call the functions that run Tasks 1, 2, 3 & 4
# Uncomment them as you get to the corresponding task
##########################################################################################

# The guard keeps the tasks from running when this file is imported; running
# the file as a script behaves as before.
if __name__ == "__main__":
    doTasks1_and_2()
    # doTask3() # Don't uncomment this till you have downloaded and unzipped the speech files!
    # doTask4() # Don't uncomment this till you have downloaded and unzipped the legislator files!