from whoosh.fields import Schema
from whoosh.fields import ID, KEYWORD, TEXT

# Index schema: `id` uniquely identifies a document, `path` points at the
# extracted text file, `source` at the original PDF.  Only `text` is left
# unstored because it can be large.
pdf_schema = Schema(id=ID(unique=True, stored=True),
                    path=ID(stored=True),
                    source=ID(stored=True),
                    author=TEXT(stored=True),
                    title=TEXT(stored=True),
                    text=TEXT)

import os

# NOTE(review): the original used the IPython magic `cd pydf/`, which is not
# valid Python in a plain script; os.chdir is the equivalent.
os.chdir('pydf')

from whoosh.index import create_in

if not os.path.exists("pdf-index"):
    os.mkdir("pdf-index")
    index = create_in("pdf-index", pdf_schema)

from whoosh.index import open_dir

index = open_dir("pdf-index")
writer = index.writer()

# (id, author, title) for the four papers.  path/source are derived from the
# id, which replaces the four near-identical add_document calls of the
# original; runtime strings are preserved byte-for-byte.
_documents = [
    ('blei2003',
     'David Blei, Andrew Ng, Michael Jordan',
     'Latent Dirichlet Allocation'),
    ('goodwyn2013',
     'Erik Goodwyn',
     'Recurrent motifs as resonant attractor states in the narrative field:'
     ' a testable model of archetype'),
    ('meij2009',
     'Edgar Meij, Dolf Trieschnigg, Maarten de Rijke, Wessel Kraaij',
     'Conceptual language models for domain-specific retrieval'),
    ('muellner2011',
     'David Muellner',
     'Modern hierarchical, agglomerative clustering algorithms'),
]

for doc_id, author, title in _documents:
    txt_path = 'data/%s.txt' % doc_id
    # `with` closes the handle; the original leaked one per document.
    with open(txt_path, encoding='utf-8') as infile:
        writer.add_document(id=doc_id,
                            path=txt_path,
                            source='static/pdfs/%s.pdf' % doc_id,
                            author=author,
                            title=title,
                            text=infile.read())
writer.commit()

searcher = index.searcher()

from whoosh.query import Term, And

query = And([Term("text", "model"), Term("text", "topic")])
results = searcher.search(query)
print('Number of hits:', len(results))
print('Best hit:', results[0])

from whoosh.query import Or

# Exercise placeholder completed: an Or query matches documents that contain
# either of the terms, not necessarily both.
query = Or([Term("text", "model"), Term("text", "topic")])
results = searcher.search(query)
print('Number of hits:', len(results))
print('Best hit:', results[0])

from whoosh.qparser import QueryParser
# (the `parser = QueryParser(...)` assignment continues in the next cell)
parser = QueryParser("text", index.schema)
parser.parse("probability model prior")
parser.parse("(cluster OR grouping) AND (model OR schema)")
parser.parse("topic index author:'Dolf Trieschnigg'")
parser.parse("clust*")

# Exercise placeholder completed: show how one of the query strings is
# parsed into a Whoosh query tree.
print(parser.parse("topic index author:'Dolf Trieschnigg'"))

import subprocess

subprocess.call(['ls', '-l'])
subprocess.call(['pdftotext', 'pdfs/blei2003.pdf', 'data/blei2003.txt'])
subprocess.call(['pdftotext', '-enc', 'UTF-8',
                 'pdfs/blei2003.pdf', 'data/blei2003.txt'])

import os


def pdftotext(pdf):
    """Convert *pdf* to a UTF-8 text file in the data/ directory.

    Exercise placeholder completed: derive the output file name from the
    PDF's base name and shell out to pdftotext with explicit UTF-8 output,
    mirroring the subprocess call demonstrated above.
    """
    basename, _ = os.path.splitext(os.path.basename(pdf))
    subprocess.call(['pdftotext', '-enc', 'UTF-8', pdf,
                     os.path.join('data', basename + '.txt')])


# if your answer is correct this should print the first 1000 bytes of the text file
pdftotext("pdfs/blei2003.pdf")
with open(os.path.join('data', 'blei2003.txt')) as infile:
    print(infile.read(1000))

subprocess.call(['pdftotext', '-htmlmeta', '-enc', 'UTF-8',
                 'pdfs/muellner2011.pdf', 'data/muellner2011.html'])
# `with` closes the preview handle (the original leaked it).
with open('data/muellner2011.html') as infile:
    print(infile.read(500))

from bs4 import BeautifulSoup


def parse_html(filename):
    """Extract the Author, Title and Text from a HTML file which was
    produced by pdftotext with the option -htmlmeta."""
    # Open in binary mode: BeautifulSoup honours `from_encoding` only for
    # byte input; for a str it is ignored (with a warning).
    with open(filename, 'rb') as infile:
        html = BeautifulSoup(infile, "html.parser", from_encoding='utf-8')
    d = {'text': html.pre.text}
    if html.title is not None:
        d['title'] = html.title.text
    for meta in html.findAll('meta'):
        try:
            if meta['name'] in ('Author', 'Title'):
                d[meta['name'].lower()] = meta['content']
        except KeyError:
            continue
    return d


parse_html('data/muellner2011.html')


def pdftotext(pdf):
    """Convert a pdf to a text file. Extract the Author and Title and
    return a dictionary consisting of the author, title and text.

    Exercise placeholder completed: convert via -htmlmeta so the metadata
    can be recovered with parse_html, then write the plain text alongside.
    """
    basename, _ = os.path.splitext(os.path.basename(pdf))
    htmlpath = os.path.join('data', basename + '.html')
    subprocess.call(['pdftotext', '-enc', 'UTF-8', '-htmlmeta', pdf, htmlpath])
    data = parse_html(htmlpath)
    with open(os.path.join('data', basename + '.txt'), 'w') as outfile:
        outfile.write(data['text'])
    return data


import shutil
# (the next revision of pdftotext continues in the next cell)
def pdftotext(pdf):
    """Convert a pdf to a text file.

    Extract the Author and Title and return a dictionary consisting of the
    author, title, text the source path, the path of the converted text
    file and the file ID."""
    basename, _ = os.path.splitext(os.path.basename(pdf))
    subprocess.call(['pdftotext', '-enc', 'UTF-8', '-htmlmeta', pdf,
                     os.path.join('data', basename + '.html')])
    data = parse_html(os.path.join('data', basename + '.html'))
    with open(os.path.join('data', basename + '.txt'), 'w') as outfile:
        outfile.write(data['text'])
    # Exercise placeholder completed: record where the PDF came from, where
    # the extracted text lives, and the document's file ID, as promised in
    # the docstring.
    data['source'] = pdf
    data['path'] = os.path.join('data', basename + '.txt')
    data['id'] = basename
    return data


pdftotext("pdfs/muellner2011.pdf")

import configparser

config = configparser.ConfigParser()
config.read('pydf.ini')
config.sections()
config['filepaths']['pdf directory']

from os.path import basename, splitext


def fileid(filepath):
    """Return the basename of a file without its extension.

    >>> fileid('/some/path/to/a/file.pdf')
    'file'
    """
    # NOTE(review): the original doctest expected `file` (no quotes), which
    # can never match the repr of the returned string.
    base, _ = splitext(basename(filepath))
    return base
# (the final, configurable pdftotext continues in the next cell)
def pdftotext(pdf, outdir='.', sourcedir='source', p2t='pdftotext', move=False):
    """Convert a pdf to a text file.

    Extract the Author and Title and return a dictionary consisting of the
    author, title, text the source path, the path of the converted text
    file and the file ID.

    pdf       -- path of the PDF file to convert
    outdir    -- directory receiving the .txt (and temporary .html) output
    sourcedir -- directory the original PDF is copied/moved into
    p2t       -- name/path of the pdftotext executable
    move      -- move the PDF into sourcedir instead of copying it
    """
    filename = fileid(pdf)
    htmlpath = os.path.join(outdir, filename + '.html')
    txtpath = os.path.join(outdir, filename + '.txt')
    # exist_ok avoids the check-then-create race of the original
    # `if not exists: mkdir` pair.
    os.makedirs(sourcedir, exist_ok=True)
    sourcepath = os.path.join(sourcedir, filename + '.pdf')
    subprocess.call([p2t, '-enc', 'UTF-8', '-htmlmeta', pdf, htmlpath])
    data = parse_html(htmlpath)
    os.remove(htmlpath)  # the HTML was only needed to harvest the metadata
    file_action = shutil.move if move else shutil.copy
    file_action(pdf, sourcepath)
    with open(txtpath, 'w') as outfile:
        outfile.write(data['text'])
    data['source'] = sourcepath
    data['path'] = txtpath
    data['id'] = filename  # already computed above; no second fileid() call
    return data


pdftotext("pdfs/blei2003.pdf",
          outdir=config.get('filepaths', 'txt directory'),
          sourcedir=config.get('filepaths', 'source directory'),
          move=config.getboolean('indexer.options', 'move pdfs'))

import glob
# (index_collection continues in the next cell)
def index_collection(configpath):
    "Main routine to index a collection of PDFs using Whoosh."
    config = configparser.ConfigParser()
    # read the configuration file (exercise placeholder completed)
    config.read(configpath)
    recompile = config.getboolean("indexer.options", "recompile")
    # check whether the supplied index directory already exists
    if not os.path.exists(config.get("filepaths", "index directory")):
        # if not, create a new directory and initialize the index
        os.mkdir(config.get("filepaths", "index directory"))
        index = create_in(config.get("filepaths", "index directory"),
                          schema=pdf_schema)
        recompile = True
    # open a connection to the index (exercise placeholder completed)
    index = open_dir(config.get("filepaths", "index directory"))
    # retrieve a set of all file IDs we already indexed
    indexed = set(map(fileid,
                      os.listdir(config.get("filepaths", "txt directory"))))
    # initialize a IndexWriter object (exercise placeholder completed)
    writer = index.writer()
    # iterate over all directories
    for directory in config.get("filepaths", "pdf directory").split(';'):
        # iterate over all PDF files in this directory
        for filepath in glob.glob(directory + "/*.pdf"):
            # poor man's solution to check whether we already indexed this pdf
            if fileid(filepath) not in indexed or recompile:
                try:
                    # call the function pdftotext with the correct arguments
                    # (exercise placeholder completed)
                    data = pdftotext(
                        filepath,
                        outdir=config.get("filepaths", "txt directory"),
                        sourcedir=config.get("filepaths", "source directory"),
                        move=config.getboolean("indexer.options",
                                               "move pdfs"))
                    # add the new document to the index
                    writer.add_document(**data)
                except (IOError, UnicodeDecodeError) as error:
                    print(error)
    # commit our changes (exercise placeholder completed)
    writer.commit()


index_collection('pydf.ini')

from flask import Flask

app = Flask(__name__)


@app.route("/")
def hello():
    return "Hello World!"
if __name__ == "__main__":
    app.run(port=5000)

from flask import Flask, render_template

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


if __name__ == '__main__':
    app.run(debug=True, host='localhost', port=8000,
            use_reloader=True, threaded=True)

from whoosh.index import open_dir
from whoosh.qparser import QueryParser


def search(query):
    """Search the PDF index for *query* and yield one dict per hit.

    Exercise placeholder completed (the original had two identical
    placeholder definitions; collapsed into one): each yielded dict carries
    the stored fields of the hit plus a 'snippet' of highlighted matches,
    which is what to_html below expects.
    """
    ix = open_dir('pdf-index')
    parser = QueryParser("text", ix.schema)
    parsed = parser.parse(query)
    with ix.searcher() as searcher:
        for hit in searcher.search(parsed):
            result = dict(hit)
            # `text` is not a stored field, so re-read the extracted text
            # file (its stored path) to let Whoosh build the snippet.
            with open(hit['path'], encoding='utf-8') as infile:
                result['snippet'] = hit.highlights("text",
                                                   text=infile.read())
            yield result


print(list(search("(topic model) OR (index probability")))


def to_html(result):
    "Return a representation of a search result in HTML."
    title = result['title'] if 'title' in result else result['id']
    author = result['author'] if 'author' in result else ''
    # NOTE(review): the original template had three %s slots for four
    # arguments, which raises TypeError at run time -- the markup was
    # evidently stripped in the export.  Reconstructed as a source link
    # followed by the author line and the snippet.
    html = """
    <p><a href="%s">%s</a><br/>
    %s<br/>
    %s</p>
    """ % (result['source'], title, author, result['snippet'])
    return html


print(to_html(next(search("topic model"))))

from flask import request, jsonify


@app.route('/searchbox', methods=['POST'])
def searchbox():
    query = request.form['q'].strip()
    html_results = '\n'.join(map(to_html, search(query)))
    return jsonify({'html': html_results})

from IPython.core.display import HTML


def css_styling():
    # `with` closes the stylesheet handle (the original leaked it).
    with open("styles/custom.css", "r") as infile:
        return HTML(infile.read())


css_styling()