# Notebook export: boolean retrieval warm-up followed by progressively richer
# versions of a small information-retrieval system.  Each section below
# corresponds to one notebook cell; later sections deliberately redefine
# names (AND, IRSystem, s) introduced by earlier ones.


def AND(vector_a, vector_b):
    """Return the element-wise logical AND of two binary vectors.

    Elements are paired with ``zip``, so the result is as long as the
    shorter input.  Every output element is the integer 0 or 1.
    """
    return [int(bool(a) and bool(b)) for a, b in zip(vector_a, vector_b)]


# these tests should return True if your code is correct
print(AND([1, 1, 0, 0], [1, 1, 1, 0]) == [1, 1, 0, 0])
print(AND([1, 0, 0, 1, 0, 0, 1], [1, 1, 1, 0, 1, 0, 1]) == [1, 0, 0, 0, 0, 0, 1])


def AND(*vectors):
    """Return the element-wise logical AND of any number of binary vectors.

    Replaces the two-argument version above.  Elements are grouped with
    ``zip``, so the result is as long as the shortest input; calling it
    without arguments returns an empty list.
    """
    return [int(all(bits)) for bits in zip(*vectors)]


# these tests should return True if your code is correct
print(AND([1, 1, 0, 0], [1, 1, 1, 0], [1, 0, 0, 0]) == [1, 0, 0, 0])
print(AND([1, 1, 1, 0, 1], [1, 0, 0, 1, 0], [0, 1, 1, 0, 1]) == [0, 0, 0, 0, 0])


def NOT(vector):
    """Return the element-wise logical negation of a binary vector."""
    return [int(not bit) for bit in vector]


# these tests should return True if your code is correct
print(AND([1, 1, 0, 0], [1, 1, 1, 0], NOT([1, 0, 0, 0])) == [0, 1, 0, 0])


import glob
import os
import re
from collections import defaultdict


def tokenize(text, lowercase=True):
    """Yield the word tokens found in *text*.

    A token is a run of word characters, optionally continued by further
    runs that are joined to it directly or by a single dot.  The text is
    lowercased first unless *lowercase* is false.
    """
    text = text.lower() if lowercase else text
    for match in re.finditer(r"\w+(\.?\w+)*", text):
        yield match.group()


class IRSystem:
    """A very simple Information Retrieval System.

    The constructor s = IRSystem() builds an empty system.
    Next, index several documents with s.index_document(ID, text).
    """

    def __init__(self):
        """Initialize an IR System."""
        self.tdf = defaultdict(set)  # term -> set of ids of documents containing it
        self.doc_ids = []

    def index_document(self, doc_id, words):
        """Add a new unindexed document to the system."""
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word].add(doc_id)

    def index_collection(self, filenames):
        """Index a collection of documents, using each file's base name as its id."""
        for filename in filenames:
            # Read inside a context manager so the file handle is closed promptly.
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))


# these tests should return True if your code is correct
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))
print('The Ghost Kings 8184.txt' in s.tdf['master'])
print('Cleopatra 2769.txt' in s.tdf['children'])

a = {'a', 'b', 'c', 'd'}
b = {'c', 'a', 'e', 'f'}
print(a.intersection(b))


class IRSystem:
    """A very simple Information Retrieval System.

    The constructor s = IRSystem() builds an empty system.
    Next, index several documents with s.index_document(ID, text).
    Then ask queries with s.query('term1', 'term2') to retrieve the
    matching documents.
    """

    def __init__(self):
        """Initialize an IR System."""
        self.tdf = defaultdict(set)  # term -> set of ids of documents containing it
        self.doc_ids = []

    def index_document(self, doc_id, words):
        """Add a new unindexed document to the system."""
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word].add(doc_id)

    def index_collection(self, filenames):
        """Index a collection of documents, using each file's base name as its id."""
        for filename in filenames:
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def query(self, *terms):
        """Return the set of ids of the documents in which all terms occur.

        An unknown term matches no document, and a query without terms
        returns an empty set.
        """
        if not terms:
            return set()
        # .get() with a default avoids both a None operand for unknown terms
        # and inserting those terms into the defaultdict index.
        return set.intersection(*(self.tdf.get(term, set()) for term in terms))


# these tests should return True if your code is correct
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))
print('Beatrice 3096.txt' in s.query("master", "children"))
print('Fair Margaret 9780.txt' in s.query("eye", "father", "work"))


from collections import Counter


class IRSystem:
    """A very simple Information Retrieval System.

    The constructor s = IRSystem() builds an empty system.
    Next, index several documents with s.index_document(ID, text).
    Then ask queries with s.query('term1', 'term2') to retrieve the
    matching documents.
    """

    def __init__(self):
        """Initialize an IR System."""
        self.tdf = defaultdict(Counter)  # changed! term -> Counter of doc id -> frequency
        self.doc_ids = []

    def index_document(self, doc_id, words):
        """Add a new unindexed document to the system."""
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1  # changed!

    def index_collection(self, filenames):
        """Index a collection of documents, using each file's base name as its id."""
        for filename in filenames:
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def query(self, *terms):
        """Return the set of ids of the documents in which all terms occur.

        An unknown term matches no document, and a query without terms
        returns an empty set.
        """
        if not terms:
            return set()
        # The index now maps terms to Counters, so take each Counter's keys
        # (the document ids) as a set before intersecting.
        return set.intersection(*(set(self.tdf.get(term, ())) for term in terms))


s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))
s.tdf['master'].most_common(n=10)  # bare expression: displayed by the notebook


class IRSystem:
    """A very simple Information Retrieval System.

    The constructor s = IRSystem() builds an empty system.
    Next, index several documents with s.index_document(ID, text).
    Then ask queries with s.query('term1', 'term2') to retrieve the
    top n matching documents.
    """

    def __init__(self):
        """Initialize an IR System."""
        self.tdf = defaultdict(Counter)  # term -> Counter of doc id -> frequency
        self.doc_ids = []

    def index_document(self, doc_id, words):
        """Add a new unindexed document to the system."""
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1

    def index_collection(self, filenames):
        """Index a collection of documents, using each file's base name as its id."""
        for filename in filenames:
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def score(self, doc_id, *terms):
        """Score a document for a query using the sum of the term frequencies."""
        # Skip unknown terms instead of indexing the defaultdict, which would
        # silently add an empty entry for them.
        return sum(self.tdf[term][doc_id] for term in terms if term in self.tdf)

    def query(self, *terms, n=10):
        """Rank all indexed documents for the given terms.

        Returns the ids of the n highest-scoring documents, best first.
        """
        scores = {doc_id: self.score(doc_id, *terms) for doc_id in self.doc_ids}
        return sorted(scores, key=scores.get, reverse=True)[:n]


# these tests should return True if your code is correct
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))
print(s.query("master")[0] == 'The Ancient Allan 5746.txt')
print(s.query("egg", "shell")[0] == 'Dawn 10892.txt')


class IRSystem:
    """A very simple Information Retrieval System.

    The constructor s = IRSystem() builds an empty system.
    Next, index several documents with s.index_document(ID, text).
    Then ask queries with s.query('term1', 'term2') to retrieve the
    top n matching documents.
    """

    def __init__(self):
        """Initialize an IR System."""
        self.tdf = defaultdict(Counter)  # term -> Counter of doc id -> frequency
        self.lengths = Counter()  # doc id -> number of tokens indexed
        self.doc_ids = []

    def index_document(self, doc_id, words):
        """Add a new unindexed document to the system and record its length."""
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1
            self.lengths[doc_id] += 1

    def index_collection(self, filenames):
        """Index a collection of documents, using each file's base name as its id."""
        for filename in filenames:
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def score(self, doc_id, *terms):
        """Score a document for a query using the sum of the term frequencies."""
        return sum(self.tdf[term][doc_id] for term in terms if term in self.tdf)

    def query(self, *terms, n=10):
        """Rank all indexed documents for the given terms.

        Returns the ids of the n highest-scoring documents, best first.
        """
        scores = {doc_id: self.score(doc_id, *terms) for doc_id in self.doc_ids}
        return sorted(scores, key=scores.get, reverse=True)[:n]


# these tests should return True if your code is correct
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))
print(s.lengths['Dawn 10892.txt'] == 192299)


class IRSystem:
    """A very simple Information Retrieval System.

    The constructor s = IRSystem() builds an empty system.
    Next, index several documents with s.index_document(ID, text).
    Then ask queries with s.query('term1', 'term2') to retrieve the
    top n matching documents.
    """

    def __init__(self):
        """Initialize an IR System."""
        self.tdf = defaultdict(Counter)  # term -> Counter of doc id -> frequency
        self.lengths = Counter()  # doc id -> number of tokens indexed
        self.doc_ids = []
        self.N = 0  # number of documents indexed

    def index_document(self, doc_id, words):
        """Add a new unindexed document to the system."""
        self.N += 1
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1
            self.lengths[doc_id] += 1

    def index_collection(self, filenames):
        """Index a collection of documents, using each file's base name as its id."""
        for filename in filenames:
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def _document_frequency(self):
        """Return the document frequency for each term in self.tdf."""
        return {term: len(documents) for term, documents in self.tdf.items()}

    def score(self, doc_id, *terms):
        """Score a document for a query using the sum of the term frequencies."""
        return sum(self.tdf[term][doc_id] for term in terms if term in self.tdf)

    def query(self, *terms, n=10):
        """Rank all indexed documents for the given terms.

        Returns the ids of the n highest-scoring documents, best first.
        """
        scores = {doc_id: self.score(doc_id, *terms) for doc_id in self.doc_ids}
        return sorted(scores, key=scores.get, reverse=True)[:n]


# these tests should return True if your code is correct
s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))
print(s._document_frequency()['children'] == 59)


import glob
import os
from math import log


class IRSystem:
    """A very simple Information Retrieval System.

    The constructor s = IRSystem() builds an empty system.
    Next, index several documents with s.index_document(doc_id, words).
    Then ask queries with s.query('term1', 'term2', n=10) to retrieve the
    top n matching documents.
    """

    def __init__(self, b=0.75, k1=1.2):
        """Initialize an IR System with the BM25 parameters *b* and *k1*."""
        self.N = 0  # number of documents indexed
        self.lengths = Counter()  # doc id -> number of tokens indexed
        self.tdf = defaultdict(Counter)  # term -> Counter of doc id -> frequency
        self.doc_ids = []
        self.b = b
        self.k1 = k1
        # False whenever df / avg_len are missing or stale (see _ensure_statistics).
        self._all_set = False

    def __repr__(self):
        # NOTE(review): the original format string was empty, presumably
        # because its angle-bracketed text was stripped during export; this
        # wording is a reconstruction -- confirm against the notebook.
        return '<IRSystem(b={self.b}, k1={self.k1}, N={self.N})>'.format(self=self)

    def index_document(self, doc_id, words):
        """Add a new unindexed document to the system."""
        self.N += 1
        self.doc_ids.append(doc_id)
        for word in words:
            self.tdf[word][doc_id] += 1
            self.lengths[doc_id] += 1
        self._all_set = False  # collection statistics must be recomputed

    def index_collection(self, filenames):
        """Index a collection of documents, using each file's base name as its id."""
        for filename in filenames:
            with open(filename) as infile:
                text = infile.read()
            self.index_document(os.path.basename(filename), tokenize(text))

    def _document_frequency(self):
        """Return the document frequency for each term in self.tdf."""
        return {term: len(documents) for term, documents in self.tdf.items()}

    def _ensure_statistics(self):
        """Recompute self.df and self.avg_len if documents were added since last time."""
        if self._all_set:
            return
        self.df = self._document_frequency()
        # An empty system has no average length; use 0 rather than dividing by zero.
        self.avg_len = sum(self.lengths.values()) / self.N if self.N else 0
        self._all_set = True

    def score(self, doc_id, *query):
        """Score a document for a particular query using Okapi BM25."""
        self._ensure_statistics()
        score = 0
        length = self.lengths[doc_id]
        # Length normalisation term; 0 when every indexed document is empty.
        norm = self.b * length / self.avg_len if self.avg_len else 0
        for term in query:
            # Membership test avoids adding unknown query terms to the index.
            tf = self.tdf[term][doc_id] if term in self.tdf else 0
            df = self.df.get(term, 0)
            # This idf is negative for terms that occur in more than half of
            # the documents.
            idf = log((self.N - df + 0.5) / (df + 0.5))
            score += (idf * (tf * (self.k1 + 1))
                      / (tf + self.k1 * (1 - self.b + norm)))
        return score

    def query(self, *query, n=10):
        """Query an indexed collection.

        Returns a list of at most n (doc_id, score) pairs, sorted by
        descending Okapi BM25 score.  An empty system yields an empty list.
        """
        self._ensure_statistics()
        scores = {doc_id: self.score(doc_id, *query) for doc_id in self.doc_ids}
        return sorted(scores.items(), key=lambda i: i[1], reverse=True)[:n]

    def present(self, results):
        """Print the query results, one "score | doc_id" line per document."""
        for doc_id, score in results:
            print("%5.2f | %s" % (100 * score, doc_id))

    def present_results(self, *query):
        """Query the collection and present the results."""
        return self.present(self.query(*query))


s = IRSystem()
s.index_collection(glob.glob('data/haggard/*.txt'))
s.present_results("regeneration", "pharao", "odds")


import urllib.request

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/Albert_einstein") as response:
    response.read()[:1000]  # bare expression: displayed by the notebook


from bs4 import BeautifulSoup

with urllib.request.urlopen("https://en.wikipedia.org/wiki/Albert_einstein") as response:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    page = BeautifulSoup(response.read(), "html.parser")
text = page.get_text()
print(text[-1000:])


def fetch_page(url):
    """Download *url* and return the text content of the page, markup removed."""
    with urllib.request.urlopen(url) as response:
        page = BeautifulSoup(response.read(), "html.parser")
    return page.get_text()


class WebSearcher(IRSystem):
    """An IRSystem whose documents are web pages identified by their URL."""

    def index_collection(self, urls):
        """Fetch and index every page in *urls*, using the URL as the document id."""
        for url in urls:
            self.index_document(url, tokenize(fetch_page(url)))


searcher = WebSearcher()
searcher.index_collection(["https://en.wikipedia.org/wiki/Albert_einstein",
                           "http://nlp.stanford.edu/IR-book/",
                           "http://www.crummy.com/software/BeautifulSoup/"])
searcher.present_results("soup")
searcher.present_results("retrieval")


from IPython.core.display import HTML


def css_styling():
    """Return the contents of styles/custom.css wrapped in an IPython HTML object."""
    with open("styles/custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)


css_styling()  # bare expression: displayed by the notebook