def AND(vector_a, vector_b):
    """Return the element-wise logical AND of two binary vectors.

    Both arguments are sequences of 0/1 values. The result is a list that
    holds 1 at every position where both inputs hold 1, and 0 elsewhere.
    If the vectors differ in length, the result is as long as the shorter
    one, because ``zip`` stops at the end of the shortest input.
    """
    return [a & b for a, b in zip(vector_a, vector_b)]
    
# Sanity checks for the two-vector AND: each line prints True when the
# computed vector equals the expected one.
print(AND([1, 1, 0, 0], [1, 1, 1, 0]) == [1, 1, 0, 0])
print(AND([1, 0, 0, 1, 0, 0, 1], [1, 1, 1, 0, 1, 0, 1]) == [1, 0, 0, 0, 0, 0, 1])

def AND(*vectors):
    """Return the element-wise logical AND of any number of binary vectors.

    Each argument is a sequence of 0/1 values. Position i of the result is 1
    only when every input vector holds a truthy value at position i. The
    result is as long as the shortest input; calling the function without
    arguments returns an empty list.
    """
    # zip(*vectors) regroups the inputs by position, one tuple per column.
    return [int(all(column)) for column in zip(*vectors)]

# Sanity checks for the variadic AND with three input vectors: each line
# prints True when the computed vector equals the expected one.
print(AND([1, 1, 0, 0], [1, 1, 1, 0], [1, 0, 0, 0]) == [1, 0, 0, 0])
print(AND([1, 1, 1, 0, 1], [1, 0, 0, 1, 0], [0, 1, 1, 0, 1]) == [0, 0, 0, 0, 0])

def NOT(vector):
    """Return the element-wise logical negation of a binary vector.

    Every truthy entry (1) becomes 0 and every falsy entry (0) becomes 1.
    The result is a new list; the input is left untouched.
    """
    return [int(not bit) for bit in vector]

# Sanity check combining AND with NOT: the third vector is negated before
# the conjunction is taken. Prints True when the result is as expected.
print(AND([1, 1, 0, 0], [1, 1, 1, 0], NOT([1, 0, 0, 0])) == [0, 1, 0, 0])

import glob, os, re
from collections import defaultdict


def tokenize(text, lowercase=True):
    """Lazily yield the tokens found in ``text``.

    A token is a run of word characters that may contain single interior
    dots followed by more word characters (so ``e.g`` stays one token).
    The text is lowercased first unless ``lowercase`` is false.
    """
    if lowercase:
        text = text.lower()
    yield from (hit.group() for hit in re.finditer(r"\w+(\.?\w+)*", text))

        
class IRSystem:
    """A very simple Information Retrieval System.

    The constructor ``s = IRSystem()`` builds an empty system. Next, index
    several documents with ``s.index_document(ID, words)``.
    """

    def __init__(self):
        "Initialize an empty IR system."
        # Inverted index: term -> set of IDs of the documents containing it.
        self.tdf = defaultdict(set)
        # IDs of all indexed documents, in indexing order.
        self.doc_ids = []

    def index_document(self, doc_id, words):
        "Add a new unindexed document, given as an iterable of words."
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word].add(doc_id)

    def index_collection(self, filenames):
        "Index a collection of files; each file's base name is its ID."
        for filename in filenames:
            # Read inside a context manager so the handle is closed promptly.
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))
    
# Build an index over every file matched by data/haggard/*.txt, then check
# that the expected file names are listed under the given terms. Each print
# shows True when the index is filled in correctly.
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))

print('The Ghost Kings 8184.txt' in s.tdf['master'])
print('Cleopatra 2769.txt' in s.tdf['children'])

# Demonstration of set intersection: prints the elements both sets share.
a = {'a', 'b', 'c', 'd'}
b = {'c', 'a', 'e', 'f'}
print(a.intersection(b))

class IRSystem:
    """A very simple Information Retrieval System.

    The constructor ``s = IRSystem()`` builds an empty system. Next, index
    several documents with ``s.index_document(ID, words)``. Then ask queries
    with ``s.query('term1', 'term2')`` to retrieve the matching documents.
    """

    def __init__(self):
        "Initialize an empty IR system."
        # Inverted index: term -> set of IDs of the documents containing it.
        self.tdf = defaultdict(set)
        # IDs of all indexed documents, in indexing order.
        self.doc_ids = []

    def index_document(self, doc_id, words):
        "Add a new unindexed document, given as an iterable of words."
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word].add(doc_id)

    def index_collection(self, filenames):
        "Index a collection of files; each file's base name is its ID."
        for filename in filenames:
            # Read inside a context manager so the handle is closed promptly.
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def query(self, *terms):
        """Return the set of IDs of the documents in which all terms occur.

        A term that was never indexed matches no document, so the result is
        then empty. Without any terms every indexed document qualifies.
        """
        if not terms:
            return set(self.doc_ids)
        # .get() keeps unknown query terms from being inserted into the
        # defaultdict index as a side effect of the lookup.
        postings = [self.tdf.get(term, set()) for term in terms]
        return set.intersection(*postings)
    
# Rebuild the index with the query-capable IRSystem and check that the
# expected file names are among the query results. Each print shows True
# when query() works as intended.
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))

print('Beatrice 3096.txt' in s.query("master", "children"))
print('Fair Margaret 9780.txt' in s.query("eye", "father", "work"))

from collections import Counter

class IRSystem:
    """A very simple Information Retrieval System.

    The constructor ``s = IRSystem()`` builds an empty system. Next, index
    several documents with ``s.index_document(ID, words)``. Then ask queries
    with ``s.query('term1', 'term2')`` to retrieve the matching documents.
    """

    def __init__(self):
        "Initialize an empty IR system."
        # Inverted index with counts: term -> Counter(doc_id -> occurrences).
        self.tdf = defaultdict(Counter)
        # IDs of all indexed documents, in indexing order.
        self.doc_ids = []

    def index_document(self, doc_id, words):
        "Add a new unindexed document, given as an iterable of words."
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1

    def index_collection(self, filenames):
        "Index a collection of files; each file's base name is its ID."
        for filename in filenames:
            # Read inside a context manager so the handle is closed promptly.
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def query(self, *terms):
        """Return the set of IDs of the documents in which all terms occur.

        A term that was never indexed matches no document, so the result is
        then empty. Without any terms every indexed document qualifies.
        """
        if not terms:
            return set(self.doc_ids)
        # The postings are Counters, which set.intersection() rejects, so
        # each one is reduced to the plain set of its document IDs first.
        # .get() with a default also covers terms that were never indexed.
        postings = [set(self.tdf.get(term, ())) for term in terms]
        return set.intersection(*postings)

# Rebuild the index with the counting IRSystem, then look up the ten
# documents with the highest count for the term 'master'. The result of the
# last expression is not stored or printed here.
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))

s.tdf['master'].most_common(n=10)

class IRSystem:
    """A very simple Information Retrieval System.

    The constructor ``s = IRSystem()`` builds an empty system. Next, index
    several documents with ``s.index_document(ID, words)``. Then ask queries
    with ``s.query('term1', 'term2')`` to retrieve the top n matching
    documents.
    """

    def __init__(self):
        "Initialize an empty IR system."
        # Inverted index with counts: term -> Counter(doc_id -> occurrences).
        self.tdf = defaultdict(Counter)
        # IDs of all indexed documents, in indexing order.
        self.doc_ids = []

    def index_document(self, doc_id, words):
        "Add a new unindexed document, given as an iterable of words."
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1

    def index_collection(self, filenames):
        "Index a collection of files; each file's base name is its ID."
        for filename in filenames:
            # Read inside a context manager so the handle is closed promptly.
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def score(self, doc_id, *terms):
        "Score a document for a particular query using the sum of the term frequencies."
        total = 0
        for term in terms:
            # .get() keeps unknown query terms out of the defaultdict index.
            postings = self.tdf.get(term)
            if postings is not None:
                total += postings[doc_id]
        return total

    def query(self, *terms, n=10):
        """Rank all indexed documents for the query and return the top n.

        Every indexed document is scored with ``score``; the IDs of the n
        highest-scoring documents are returned, best first. Documents that
        contain none of the terms take part with a score of 0.
        """
        scores = {doc_id: self.score(doc_id, *terms) for doc_id in self.doc_ids}
        return sorted(scores, key=scores.get, reverse=True)[:n]


# Rebuild the index with the scoring IRSystem and check which file name is
# ranked first for two queries. Each print shows True when score() sums the
# term frequencies as intended.
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))

print(s.query("master")[0] == 'The Ancient Allan 5746.txt')
print(s.query("egg", "shell")[0] == 'Dawn 10892.txt')

class IRSystem:
    """A very simple Information Retrieval System.

    The constructor ``s = IRSystem()`` builds an empty system. Next, index
    several documents with ``s.index_document(ID, words)``. Then ask queries
    with ``s.query('term1', 'term2')`` to retrieve the top n matching
    documents.
    """

    def __init__(self):
        "Initialize an empty IR system."
        # Inverted index with counts: term -> Counter(doc_id -> occurrences).
        self.tdf = defaultdict(Counter)
        # Number of indexed words per document: doc_id -> length.
        self.lengths = Counter()
        # IDs of all indexed documents, in indexing order.
        self.doc_ids = []

    def index_document(self, doc_id, words):
        "Add a new unindexed document, given as an iterable of words."
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1
            self.lengths[doc_id] += 1

    def index_collection(self, filenames):
        "Index a collection of files; each file's base name is its ID."
        for filename in filenames:
            # Read inside a context manager so the handle is closed promptly.
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def score(self, doc_id, *terms):
        "Score a document for a particular query using the sum of the term frequencies."
        total = 0
        for term in terms:
            # .get() keeps unknown query terms out of the defaultdict index.
            postings = self.tdf.get(term)
            if postings is not None:
                total += postings[doc_id]
        return total

    def query(self, *terms, n=10):
        """Rank all indexed documents for the query and return the top n.

        Every indexed document is scored with ``score``; the IDs of the n
        highest-scoring documents are returned, best first. Documents that
        contain none of the terms take part with a score of 0.
        """
        scores = {doc_id: self.score(doc_id, *terms) for doc_id in self.doc_ids}
        return sorted(scores, key=scores.get, reverse=True)[:n]


# Rebuild the index with the length-tracking IRSystem and check the word
# count recorded for one file. Prints True when the lengths are tracked as
# intended.
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))

print(s.lengths['Dawn 10892.txt'] == 192299)

class IRSystem:
    """A very simple Information Retrieval System.

    The constructor ``s = IRSystem()`` builds an empty system. Next, index
    several documents with ``s.index_document(ID, words)``. Then ask queries
    with ``s.query('term1', 'term2')`` to retrieve the top n matching
    documents.
    """

    def __init__(self):
        "Initialize an empty IR system."
        # Inverted index with counts: term -> Counter(doc_id -> occurrences).
        self.tdf = defaultdict(Counter)
        # Number of indexed words per document: doc_id -> length.
        self.lengths = Counter()
        # IDs of all indexed documents, in indexing order.
        self.doc_ids = []
        # Number of index_document() calls made so far.
        self.N = 0

    def index_document(self, doc_id, words):
        "Add a new unindexed document, given as an iterable of words."
        self.N += 1
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1
            self.lengths[doc_id] += 1

    def index_collection(self, filenames):
        "Index a collection of files; each file's base name is its ID."
        for filename in filenames:
            # Read inside a context manager so the handle is closed promptly.
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def _document_frequency(self):
        "Return the document frequency for each term in self.tdf."
        # Each posting Counter has one key per document containing the term.
        return {term: len(documents) for term, documents in self.tdf.items()}

    def score(self, doc_id, *terms):
        "Score a document for a particular query using the sum of the term frequencies."
        total = 0
        for term in terms:
            # .get() keeps unknown query terms out of the defaultdict index,
            # so they never show up in the document frequencies.
            postings = self.tdf.get(term)
            if postings is not None:
                total += postings[doc_id]
        return total

    def query(self, *terms, n=10):
        """Rank all indexed documents for the query and return the top n.

        Every indexed document is scored with ``score``; the IDs of the n
        highest-scoring documents are returned, best first. Documents that
        contain none of the terms take part with a score of 0.
        """
        scores = {doc_id: self.score(doc_id, *terms) for doc_id in self.doc_ids}
        return sorted(scores, key=scores.get, reverse=True)[:n]


# Rebuild the index and check the document frequency reported for the term
# 'children'. Prints True when _document_frequency() works as intended.
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))

print(s._document_frequency()['children'] == 59)

import glob, os
from math import log

class IRSystem:
    """A very simple Information Retrieval System.

    The constructor ``s = IRSystem()`` builds an empty system. Next, index
    several documents with ``s.index_document(doc_id, words)``. Then ask
    queries with ``s.query('term1', 'term2', n=10)`` to retrieve the top n
    matching documents, ranked with Okapi BM25.
    """

    def __init__(self, b=0.75, k1=1.2):
        """Initialize an empty IR system.

        ``b`` and ``k1`` are the two free parameters of the BM25 formula
        used in ``score``.
        """
        # Number of index_document() calls made so far.
        self.N = 0
        # Number of indexed words per document: doc_id -> length.
        self.lengths = Counter()
        # Inverted index with counts: term -> Counter(doc_id -> occurrences).
        self.tdf = defaultdict(Counter)
        # IDs of all indexed documents, in indexing order.
        self.doc_ids = []
        self.b = b
        self.k1 = k1
        # True while the cached statistics (self.df, self.avg_len) are in
        # sync with the index.
        self._all_set = False

    def __repr__(self):
        return '<IRSystem(b={self.b}, k1={self.k1}, N={self.N})>'.format(self=self)

    def index_document(self, doc_id, words):
        "Add a new unindexed document, given as an iterable of words."
        self.N += 1
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1
            self.lengths[doc_id] += 1
        # The cached collection statistics are stale from now on.
        self._all_set = False

    def index_collection(self, filenames):
        "Index a collection of files; each file's base name is its ID."
        for filename in filenames:
            # Read inside a context manager so the handle is closed promptly.
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def _document_frequency(self):
        "Return the document frequency for each term in self.tdf."
        return {term: len(documents) for term, documents in self.tdf.items()}

    def _update_statistics(self):
        "Recompute the cached document frequencies and average length if stale."
        if self._all_set:
            return
        self.df = self._document_frequency()
        # An empty index has no average length; 0.0 avoids dividing by N == 0.
        self.avg_len = sum(self.lengths.values()) / self.N if self.N else 0.0
        self._all_set = True

    def score(self, doc_id, *query):
        """Score a document for a particular query using Okapi BM25.

        Returns the sum of the per-term BM25 contributions. Terms that do
        not occur in the document contribute nothing.
        """
        # Makes score() usable on its own, not only after query().
        self._update_statistics()
        score = 0.0
        length = self.lengths[doc_id]
        for term in query:
            # .get() keeps unknown query terms out of the defaultdict index.
            postings = self.tdf.get(term)
            tf = postings[doc_id] if postings is not None else 0
            if not tf:
                # A zero term frequency makes the contribution zero. Skipping
                # it also avoids dividing by an average length of 0 when
                # every indexed document is empty.
                continue
            df = self.df.get(term, 0)
            idf = log((self.N - df + 0.5) / (df + 0.5))
            score += (idf * (tf * (self.k1 + 1)) /
                          (tf + self.k1 * (1 - self.b + (self.b * length / self.avg_len))))
        return score

    def query(self, *query, n=10):
        """Query an indexed collection.

        Returns a list of at most n ``(doc_id, score)`` pairs, sorted by
        descending Okapi BM25 score. An empty index yields an empty list.
        """
        self._update_statistics()
        scores = {doc_id: self.score(doc_id, *query) for doc_id in self.doc_ids}
        return sorted(scores.items(), key=lambda i: i[1], reverse=True)[:n]

    def present(self, results):
        "Print one line per (doc_id, score) pair, with the score scaled by 100."
        for doc_id, score in results:
            print("%5.2f | %s" % (100 * score, doc_id))

    def present_results(self, *query):
        "Query the collection and present the results."
        return self.present(self.query(*query))

# Rebuild the index with the BM25 IRSystem and print the ranked results for
# a three-term query.
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))

s.present_results("regeneration", "pharao", "odds")

import urllib.request

# Download the page and take the first 1000 bytes of the raw response body.
# The slice is not stored or printed here.
response = urllib.request.urlopen("https://en.wikipedia.org/wiki/Albert_einstein")
response.read()[:1000]

from bs4 import BeautifulSoup

# Download the page again and parse it. The context manager closes the HTTP
# response once the body has been read. Naming the parser explicitly keeps
# the parse independent of which optional parsers happen to be installed.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/Albert_einstein") as response:
    page = BeautifulSoup(response.read(), "html.parser")

# Extract the text of the parsed page and show its last 1000 characters.
text = page.get_text()
print(text[-1000:])

def fetch_page(url):
    """Download the page at ``url`` and return its text content.

    The response body is parsed with BeautifulSoup's built-in
    ``html.parser`` and the text is extracted with ``get_text()``. Errors
    raised by ``urllib.request.urlopen`` are not caught here.
    """
    # The context manager closes the HTTP response after the body is read.
    with urllib.request.urlopen(url) as response:
        page = BeautifulSoup(response.read(), "html.parser")
    return page.get_text()


class WebSearcher(IRSystem):
    """An IRSystem whose documents are web pages.

    Use ``index_collection(urls)`` to download and index a list of pages,
    then query the system like any other IRSystem.
    """

    def index_collection(self, urls):
        "Fetch every URL and index the page text under the URL itself."
        for url in urls:
            # The full URL is the document ID: unlike a file path, its base
            # name can be empty (for example when it ends in a slash).
            self.index_document(url, tokenize(fetch_page(url)))

# Index three web pages with the WebSearcher, then print the ranked results
# for two single-term queries.
searcher = WebSearcher()
searcher.index_collection(["https://en.wikipedia.org/wiki/Albert_einstein",
                           "http://nlp.stanford.edu/IR-book/",
                           "http://www.crummy.com/software/BeautifulSoup/"])

searcher.present_results("soup")

searcher.present_results("retrieval")

from IPython.core.display import HTML
def css_styling():
    """Load the custom stylesheet and wrap it for display.

    Reads ``styles/custom.css`` (relative to the current working directory)
    and returns its contents wrapped in an ``HTML`` object.
    """
    # The context manager closes the file handle instead of leaving that to
    # garbage collection.
    with open("styles/custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)
# Call the helper at top level; the returned HTML object is not stored.
# NOTE(review): presumably the notebook renders it as the cell's output.
css_styling()