#+ setup,include=FALSE
source("http://www.cs.utoronto.ca/~radford/csc121/options.r")
#+

#' CSC 121, Spring 2017, Large Assignment #2, Part 1 script.
#'
#' We'll see how well Zipf's Law applies to Jane Austen's
#' "Pride and Prejudice".

# Defines 'plot_with_line', which is used for every fit below.
source("lga2-defs1.R")

# Read the text as a character vector of whitespace-separated words.
text <- scan(
  "http://www.cs.utoronto.ca/~radford/csc121/pride-and-prejudice.txt",
  what = ""
)

#' Find the counts for each unique word, and sort them to go with ranks of
#' 1, 2, 3, ...

word_counts <- sort(table(text), decreasing = TRUE)
word_ranks <- seq_along(word_counts)

#' Plot the counts versus ranks for the whole set of words, on
#' logarithmic scales, along with the best fit line, as found by 'lm'.

# First, create an empty plot with the right scales.

plot(log(word_ranks), log(word_counts),
     xlab = "log word rank", ylab = "log word count", type = "n")

# Add the points and best fit line, and save the result of 'lm' in 'm'.

m <- plot_with_line(log(word_ranks), log(word_counts))  # from lga2-defs1.R

#' Here are the parameters of the best fit line for all words.

coef(m)

#' The line does not fit the points very well, and has a slope that is
#' not very close to the slope of -1 expected for the original form of
#' Zipf's Law.
#'
#' Based on the plot above, it seems like the words with the 20 highest ranks
#' are best modelled separately, and the words with ranks above 700
#' are also best modelled separately.

#' Here is the plot of these three groups, with separately fitted lines.

# Create an empty plot with the right scales.

plot(log(word_ranks), log(word_counts),
     xlab = "log word rank", ylab = "log word count", type = "n")

# The group boundaries below are hard-coded for this text.  With fewer than
# 701 distinct words, '701:length(word_counts)' would count downwards and
# '21:700' would index past the end, so fail loudly instead.

stopifnot(length(word_counts) > 700)

# Add points and best fit lines for each of the three groups, in different
# colours.

w1 <- 1:20
m1 <- plot_with_line(log(word_ranks[w1]), log(word_counts[w1]), "orange")

w2 <- 21:700
m2 <- plot_with_line(log(word_ranks[w2]), log(word_counts[w2]), "green")

w3 <- seq.int(701, length(word_counts))
m3 <- plot_with_line(log(word_ranks[w3]), log(word_counts[w3]), "blue")

#' Here are the parameters of the lines for the three groups.

rbind(coef(m1), coef(m2), coef(m3))

#' The fit of the lines to the points is much better when the words are
#' divided into three groups.  The slope for the middle group of -1.1 is
#' close to the slope of -1 that would be expected for the original form
#' of Zipf's Law.