def read_file(filename):
    """Read the contents of FILENAME and return them as a string.

    The file is opened with open()'s default (text) mode. Windows users
    may need codecs.open (after importing codecs) instead.
    """
    # The with-statement closes the file even when read() raises.
    with open(filename) as infile:
        return infile.read()

# Load the Emma excerpt through read_file and print it in full.
text = read_file("data/austen-emma-excerpt.txt")
print(text)

from os import listdir

# Bare expression (notebook style): lists the entries of the "data" directory;
# the result is neither stored nor printed here.
listdir("data")

def list_textfiles(directory):
    """Collect the paths of all entries in DIRECTORY that end in '.txt'.

    Each path is DIRECTORY and the entry name joined by a forward slash.
    The paths come in whatever order listdir reports the entries.
    """
    return [directory + "/" + entry
            for entry in listdir(directory)
            if entry.endswith(".txt")]

# Report the size in characters of every .txt file in the training directory.
for filepath in list_textfiles("data/gutenberg/training"):
    text = read_file(filepath)
    print(filepath +  " has " + str(len(text)) + " characters.")

def end_of_sentence_marker(character):
    """Return True if CHARACTER ends a sentence, False otherwise.

    The characters '.', '?' and '!' count as end-of-sentence markers.
    """
    # A tuple (not a string) is used so that the empty string does not match.
    return character in (".", "?", "!")

# Sanity checks for end_of_sentence_marker: each line prints True when the
# function is implemented correctly.
print(end_of_sentence_marker("?") == True)
print(end_of_sentence_marker("a") == False)

# enumerate yields (index, item) pairs; here each pair is printed as a tuple.
for element in enumerate("Python"):
    print(element)

# The pair can be unpacked in the for statement itself; only the index is
# printed here.
for index, character in enumerate("Python"):
    print(index)

def split_sentences(text):
    """Split a text string into a list of sentences.

    A sentence ends at every character for which end_of_sentence_marker
    returns a true value; the marker stays at the end of its sentence.
    Text after the last marker is returned as a final sentence unless it
    consists of whitespace only. An empty TEXT gives an empty list.
    """
    sentences = []
    start = 0
    for end, character in enumerate(text):
        if end_of_sentence_marker(character):
            sentences.append(text[start: end + 1])
            start = end + 1
    # Keep trailing text that has no closing marker instead of dropping it.
    remainder = text[start:]
    if remainder.strip():
        sentences.append(remainder)
    return sentences

# Try the splitter on a two-sentence example and print the resulting list.
print(split_sentences("This is a sentence. Should we seperate it from this one?"))

from pyhum.preprocessing import clean_text

def tokenize(text):
    """Transform TEXT into a list of sentences. Lowercase
    each sentence and remove all punctuation. Finally split each
    sentence into a list of words.

    Returns a list with one entry per sentence found by split_sentences;
    each entry is the list of whitespace-separated words of that sentence.
    """
    tokenized = []
    for sentence in split_sentences(text):
        # NOTE(review): clean_text (from pyhum.preprocessing) is assumed to
        # lowercase the sentence and strip its punctuation -- confirm.
        tokenized.append(clean_text(sentence).split())
    return tokenized

# Sanity check for tokenize: prints True when the function is implemented
# correctly.
print(tokenize("This is a sentence. So, what!") == 
      [["this", "is", "a", "sentence"], ["so", "what"]])

# insert your code here
# NOTE(review): nothing in the visible code defines `corpus` before this
# point, so the code inserted here must create it; otherwise the print below
# raises NameError.

print(len(corpus))

# Bare expression (notebook style): the first 20 .txt paths found in
# data/arabian_nights; the result is not stored.
list_textfiles("data/arabian_nights")[:20]

from os.path import splitext

def remove_ext(filename):
    """Return FILENAME without its extension.

    Only the last extension is removed (as split off by os.path.splitext);
    any directory part of FILENAME is kept.
    """
    return splitext(filename)[0]
    
# Sanity checks for remove_ext: each line prints True when the function is
# implemented correctly.
print(remove_ext("data/arabian_nights/1.txt") == "data/arabian_nights/1")
print(remove_ext("ridiculous_selfie.jpg") == "ridiculous_selfie")

from os.path import basename

def remove_dir(filepath):
    """Return the final component of FILEPATH, without any directories.

    This is os.path.basename: a path ending in a separator gives ''.
    """
    return basename(filepath)
    
# Sanity checks for remove_dir: each line prints True when the function is
# implemented correctly.
print(remove_dir("data/arabian_nights/1.txt") == "1.txt")
print(remove_dir("/a/kind/of/funny/filepath/to/file.txt") == "file.txt")

def get_filename(filepath):
    """Return the file name in FILEPATH without directories or extension.

    For example, 'data/arabian_nights/1.txt' gives '1'.
    """
    # Drop the directories first, then the (last) extension.
    return splitext(basename(filepath))[0]
    
# Sanity check for get_filename: prints True when the function is
# implemented correctly.
print(get_filename("data/arabian_nights/1.txt") == '1')

# int() converts a string of digits into an integer.
x_as_string = "1"
x_as_int = int(x_as_string)
print(x_as_int)

# With strings, + concatenates: this prints 12.
x = "1"
y = "2"
print(x + y)

# With integers, + adds: this prints 3.
x = 1
y = 2
print(x + y)

def get_night(filepath):
    """Return the night number encoded in FILEPATH as an integer.

    The number is the file name without directories and extension, so
    'data/arabian_nights/1.txt' gives 1. Raises ValueError when that name
    is not a valid integer.
    """
    return int(splitext(basename(filepath))[0])

# Sanity check for get_night: prints True when the function is implemented
# correctly.
print(get_night("data/arabian_nights/1.txt") == 1)

filenames = list_textfiles('data/arabian_nights')

# Sort in place by the value get_night returns for each path, then show the
# first 20 paths (bare expression, notebook style).
filenames.sort(key=get_night)
filenames[:20]

# For comparison: a plain sort orders the paths as strings.
filenames = list_textfiles('data/arabian_nights')
filenames.sort()
print(filenames[:20])

# Build the corpus: one tokenized text per file, with the files ordered by
# the value get_night returns for their path.
corpus = []
filenames = list_textfiles("data/arabian_nights")
filenames.sort(key=get_night)
for filename in filenames:
    text = read_file(filename)
    corpus.append(tokenize(text))

# One entry per corpus item: the number of sentences it contains.
sentences_per_night = [len(night) for night in corpus]
print(sentences_per_night[:10])

# Largest and smallest sentence counts (bare expressions, notebook style).
max(sentences_per_night)

min(sentences_per_night)

# sum adds up the items of a list: this prints 11.
print(sum([1, 3, 3, 4]))

# float() converts an integer into a floating-point number: the first print
# shows 1, the second shows 1.0.
# The conversion matters for division under Python 2, where dividing two
# integers discards the remainder; if you use Python 3.x you don't need to
# worry about that.
number = 1
print(number)
number = float(number)
print(number)

# insert your code here

# One entry per corpus item: its word count, summed over all its sentences.
words_per_night = [
    sum(len(sentence) for sentence in night)
    for night in corpus
]

def story_time(text):
    """Return the time needed to tell TEXT at a rate of 130 words per unit.

    TEXT is a list of sentences, each sentence being a list of words. The
    result is the total number of words divided by 130, as a float
    (presumably minutes at 130 words per minute -- confirm the unit).
    An empty TEXT gives 0.0.
    """
    n_words = sum(len(sentence) for sentence in text)
    # float() keeps the division exact under Python 2 as well.
    return float(n_words) / 130

# Sanity check for story_time: prints True when the function is implemented
# correctly.
print(story_time([["story", "story"]]) * 130 == 2.0)

# One entry per corpus item: the value story_time returns for that text.
# NOTE(review): this fills in the exercise placeholder; it assumes the list is
# meant to hold story_time for every text in `corpus` -- confirm.
story_time_per_night = []
for night in corpus:
    story_time_per_night.append(story_time(night))
print(story_time_per_night[:10])

# insert your code here

import matplotlib.pyplot as plt

# Line plot of the sentence counts; with a single argument, the list index is
# used as the x value.
plt.plot(sentences_per_night)

# insert your code here

# insert your code here

def positions_of(word, texts=None):
    """Return the positions of the texts in which WORD occurs.

    TEXTS is a list of tokenized texts (each a list of sentences, each
    sentence a list of words); when omitted, the module-level `corpus` is
    used. The result lists the 0-based index of every text that contains
    WORD in at least one sentence, in ascending order.

    NOTE(review): the exercise left this body empty; reporting one index
    per text (rather than, say, word offsets) is an assumption -- confirm.
    """
    if texts is None:
        texts = corpus
    positions = []
    for index, night in enumerate(texts):
        if any(word in sentence for sentence in night):
            positions.append(index)
    return positions

# Look up the positions of three words; the results feed the plot below.
positions_of_shahrazad = positions_of("shahrazad")
positions_of_ali = positions_of("ali")
positions_of_egypt = positions_of("egypt")

# Draw the positions of each word as tall vertical tick marks, one
# horizontal lane per word: Egypt on lane 0, Shahrazad on 1, Ali on 2.
plt.figure(figsize=(20, 8))
# The labels are listed in lane order (index == y value of the lane) so that
# every tick label matches the data plotted at that height.
names = ["Egypt", "Shahrazad", "Ali"]
plt.plot(positions_of_shahrazad, [1]*len(positions_of_shahrazad), "|", markersize=100)
plt.plot(positions_of_ali, [2]*len(positions_of_ali), "|", markersize=100)
plt.plot(positions_of_egypt, [0]*len(positions_of_egypt), "|", markersize=100)
plt.yticks(range(len(names)), names)
# Leave some room below lane 0 and above lane 2.
_ = plt.ylim(-1, 3)

from IPython.core.display import HTML
def css_styling():
    """Read styles/custom.css and return its contents wrapped in HTML."""
    # The with-statement closes the stylesheet once it has been read.
    with open("styles/custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)
# Bare call (notebook style): the returned HTML object is not stored.
css_styling()