def read_file(filename):
    """Read the contents of FILENAME and return as a string."""
    # windows users should use codecs.open after importing codecs
    # The context manager closes the file even if reading raises.
    with open(filename) as infile:
        return infile.read()


text = read_file("data/austen-emma-excerpt.txt")
print(text)

from os import listdir

listdir("data")


def list_textfiles(directory):
    """Return a list of filenames ending in '.txt' in DIRECTORY."""
    # Paths are built with "/" so they match the literal paths used in the
    # checks further down (e.g. "data/arabian_nights/1.txt").
    return [
        directory + "/" + filename
        for filename in listdir(directory)
        if filename.endswith(".txt")
    ]


for filepath in list_textfiles("data/gutenberg/training"):
    text = read_file(filepath)
    print(filepath + " has " + str(len(text)) + " characters.")


def end_of_sentence_marker(character):
    """Return True if CHARACTER is one of '.', '?' or '!'."""
    # A tuple (rather than the string ".?!") keeps the empty string and
    # multi-character strings from matching by substring.
    return character in (".", "?", "!")


# these tests should return True if your code is correct
print(end_of_sentence_marker("?") == True)
print(end_of_sentence_marker("a") == False)

for element in enumerate("Python"):
    print(element)

for index, character in enumerate("Python"):
    print(index)


def split_sentences(text):
    """Split a text string into a list of sentences.

    Each sentence runs up to and including its end-of-sentence marker.
    Text after the last marker is not returned.
    """
    sentences = []
    start = 0
    for end, character in enumerate(text):
        if end_of_sentence_marker(character):
            sentences.append(text[start: end + 1])
            start = end + 1
    return sentences


print(split_sentences("This is a sentence. Should we seperate it from this one?"))

from pyhum.preprocessing import clean_text


def tokenize(text):
    """Transform TEXT into a list of sentences. Lowercase each sentence
    and remove all punctuation. Finally split each sentence into a list
    of words."""
    # NOTE(review): assumes clean_text lowercases its argument and strips
    # punctuation, as the docstring above requires -- confirm against pyhum.
    return [clean_text(sentence).split() for sentence in split_sentences(text)]


# these tests should return True if your code is correct
print(tokenize("This is a sentence. So, what!") ==
      [["this", "is", "a", "sentence"], ["so", "what"]])

# NOTE(review): the exercise left `corpus` undefined before it is printed;
# loading data/arabian_nights is an assumption based on the cells that
# follow -- confirm the intended directory.
corpus = [tokenize(read_file(filepath))
          for filepath in list_textfiles("data/arabian_nights")]
print(len(corpus))

list_textfiles("data/arabian_nights")[:20]

from os.path import splitext


def remove_ext(filename):
    """Return FILENAME without its extension."""
    return splitext(filename)[0]


# these tests should return True if your code is correct
print(remove_ext("data/arabian_nights/1.txt") == "data/arabian_nights/1")
print(remove_ext("ridiculous_selfie.jpg") == "ridiculous_selfie")

from os.path import basename


def remove_dir(filepath):
    """Return the final component of FILEPATH, dropping any directories."""
    return basename(filepath)


# these tests should return True if your code is correct
print(remove_dir("data/arabian_nights/1.txt") == "1.txt")
print(remove_dir("/a/kind/of/funny/filepath/to/file.txt") == "file.txt")


def get_filename(filepath):
    """Return the file name in FILEPATH without directories or extension."""
    return remove_ext(remove_dir(filepath))


# these tests should return True if your code is correct
print(get_filename("data/arabian_nights/1.txt") == '1')

x_as_string = "1"
x_as_int = int(x_as_string)
print(x_as_int)

x = "1"
y = "2"
print(x + y)

x = 1
y = 2
print(x + y)


def get_night(filepath):
    """Return the bare file name of FILEPATH converted to an integer."""
    return int(get_filename(filepath))


# these tests should return True if your code is correct
print(get_night("data/arabian_nights/1.txt") == 1)

# Sorting on the integer key orders files numerically (1, 2, 3, ...).
filenames = list_textfiles('data/arabian_nights')
filenames.sort(key=get_night)
filenames[:20]

# Without a key the paths sort as strings, for comparison.
filenames = list_textfiles('data/arabian_nights')
filenames.sort()
print(filenames[:20])

corpus = []
filenames = list_textfiles("data/arabian_nights")
filenames.sort(key=get_night)
for filename in filenames:
    text = read_file(filename)
    corpus.append(tokenize(text))

# Each entry of corpus is a list of sentences, so its length is the
# number of sentences in that file.
sentences_per_night = []
for night in corpus:
    sentences_per_night.append(len(night))
print(sentences_per_night[:10])

max(sentences_per_night)

min(sentences_per_night)

print(sum([1, 3, 3, 4]))

# if you use Python 3.x, both print statements will return
# the same thing and you don't need to worry.
number = 1
print(number)
number = float(number)
print(number)

# insert your code here

# Total number of words in each night: the sum of its sentence lengths.
words_per_night = []
for night in corpus:
    n_words = sum(len(sentence) for sentence in night)
    words_per_night.append(n_words)


def story_time(text):
    """Return the number of words in TEXT divided by 130.

    TEXT is a list of sentences, each a list of words. The divisor 130 is
    the rate implied by the check below (presumably words per minute --
    confirm). Float division keeps the result fractional.
    """
    n_words = sum(len(sentence) for sentence in text)
    return n_words / 130.0


# these tests should return True if your code is correct
print(story_time([["story", "story"]]) * 130 == 2.0)

story_time_per_night = []
for night in corpus:
    story_time_per_night.append(story_time(night))
print(story_time_per_night[:10])

# insert your code here

import matplotlib.pyplot as plt

plt.plot(sentences_per_night)

# insert your code here

# insert your code here


def positions_of(word):
    """Return the indexes in corpus of the nights that contain WORD.

    NOTE(review): the exercise gives no check for this function; returning
    one index per night in which WORD occurs is an assumption based on how
    the result is plotted below -- confirm.
    """
    positions = []
    for index, night in enumerate(corpus):
        if any(word in sentence for sentence in night):
            positions.append(index)
    return positions


positions_of_shahrazad = positions_of("shahrazad")
positions_of_ali = positions_of("ali")
positions_of_egypt = positions_of("egypt")

plt.figure(figsize=(20, 8))
names = ["Shahrazad", "Ali", "Egypt"]
# Each word gets its own horizontal row of "|" markers at a fixed y value.
plt.plot(positions_of_shahrazad, [1]*len(positions_of_shahrazad), "|", markersize=100)
plt.plot(positions_of_ali, [2]*len(positions_of_ali), "|", markersize=100)
plt.plot(positions_of_egypt, [0]*len(positions_of_egypt), "|", markersize=100)
plt.yticks(range(len(names)), names)
_ = plt.ylim(-1, 3)

from IPython.core.display import HTML


def css_styling():
    """Read styles/custom.css and return its contents wrapped in HTML."""
    # The context manager closes the stylesheet even if reading raises.
    with open("styles/custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)


css_styling()